-rw-r--r--Documentation/ABI/stable/sysfs-block14
-rw-r--r--Documentation/admin-guide/md.rst86
-rw-r--r--Documentation/filesystems/locking.rst2
-rw-r--r--Documentation/scsi/scsi_mid_low_api.rst8
-rw-r--r--arch/m68k/emu/nfblock.c4
-rw-r--r--arch/um/drivers/ubd_kern.c6
-rw-r--r--block/bfq-iosched.c22
-rw-r--r--block/bio-integrity.c25
-rw-r--r--block/bio.c76
-rw-r--r--block/blk-cgroup.c29
-rw-r--r--block/blk-cgroup.h12
-rw-r--r--block/blk-core.c19
-rw-r--r--block/blk-crypto-fallback.c19
-rw-r--r--block/blk-iolatency.c19
-rw-r--r--block/blk-map.c13
-rw-r--r--block/blk-merge.c85
-rw-r--r--block/blk-mq-debugfs.c1
-rw-r--r--block/blk-mq-dma.c4
-rw-r--r--block/blk-mq-sched.c14
-rw-r--r--block/blk-mq-sched.h13
-rw-r--r--block/blk-mq-sysfs.c1
-rw-r--r--block/blk-mq-tag.c128
-rw-r--r--block/blk-mq.c175
-rw-r--r--block/blk-mq.h22
-rw-r--r--block/blk-settings.c84
-rw-r--r--block/blk-sysfs.c60
-rw-r--r--block/blk-throttle.c15
-rw-r--r--block/blk-throttle.h18
-rw-r--r--block/blk.h46
-rw-r--r--block/elevator.c3
-rw-r--r--block/elevator.h2
-rw-r--r--block/fops.c10
-rw-r--r--block/ioctl.c4
-rw-r--r--block/kyber-iosched.c19
-rw-r--r--block/mq-deadline.c20
-rw-r--r--block/partitions/ibm.c2
-rw-r--r--drivers/ata/libata-scsi.c4
-rw-r--r--drivers/block/amiflop.c10
-rw-r--r--drivers/block/aoe/aoeblk.c4
-rw-r--r--drivers/block/aoe/aoemain.c2
-rw-r--r--drivers/block/floppy.c13
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c6
-rw-r--r--drivers/block/nbd.c10
-rw-r--r--drivers/block/null_blk/main.c2
-rw-r--r--drivers/block/rbd.c2
-rw-r--r--drivers/block/rnbd/rnbd-clt.c6
-rw-r--r--drivers/block/sunvdc.c7
-rw-r--r--drivers/block/swim.c4
-rw-r--r--drivers/block/ublk_drv.c231
-rw-r--r--drivers/block/virtio_blk.c8
-rw-r--r--drivers/block/xen-blkfront.c4
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/md/Kconfig29
-rw-r--r--drivers/md/Makefile4
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/io.c3
-rw-r--r--drivers/md/bcache/journal.c2
-rw-r--r--drivers/md/bcache/movinggc.c8
-rw-r--r--drivers/md/bcache/super.c2
-rw-r--r--drivers/md/bcache/writeback.c8
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-raid.c18
-rw-r--r--drivers/md/dm-vdo/vio.c2
-rw-r--r--drivers/md/dm.c4
-rw-r--r--drivers/md/md-bitmap.c89
-rw-r--r--drivers/md/md-bitmap.h107
-rw-r--r--drivers/md/md-cluster.c2
-rw-r--r--drivers/md/md-linear.c14
-rw-r--r--drivers/md/md-llbitmap.c1626
-rw-r--r--drivers/md/md.c382
-rw-r--r--drivers/md/md.h24
-rw-r--r--drivers/md/raid0.c30
-rw-r--r--drivers/md/raid1-10.c2
-rw-r--r--drivers/md/raid1.c119
-rw-r--r--drivers/md/raid1.h4
-rw-r--r--drivers/md/raid10.c107
-rw-r--r--drivers/md/raid10.h2
-rw-r--r--drivers/md/raid5.c74
-rw-r--r--drivers/memstick/core/ms_block.c4
-rw-r--r--drivers/memstick/core/mspro_block.c4
-rw-r--r--drivers/message/fusion/mptscsih.c2
-rw-r--r--drivers/message/fusion/mptscsih.h2
-rw-r--r--drivers/mmc/core/block.c4
-rw-r--r--drivers/mtd/mtd_blkdevs.c4
-rw-r--r--drivers/mtd/ubi/block.c4
-rw-r--r--drivers/nvdimm/btt.c4
-rw-r--r--drivers/nvme/host/core.c4
-rw-r--r--drivers/nvme/host/ioctl.c5
-rw-r--r--drivers/nvme/host/nvme.h2
-rw-r--r--drivers/nvme/host/pci.c21
-rw-r--r--drivers/s390/block/dasd.c7
-rw-r--r--drivers/scsi/3w-9xxx.c2
-rw-r--r--drivers/scsi/3w-sas.c2
-rw-r--r--drivers/scsi/3w-xxxx.c2
-rw-r--r--drivers/scsi/BusLogic.c4
-rw-r--r--drivers/scsi/BusLogic.h2
-rw-r--r--drivers/scsi/aacraid/linit.c6
-rw-r--r--drivers/scsi/advansys.c2
-rw-r--r--drivers/scsi/aha152x.c4
-rw-r--r--drivers/scsi/aha1542.c2
-rw-r--r--drivers/scsi/aha1740.c2
-rw-r--r--drivers/scsi/aic7xxx/aic79xx_osm.c4
-rw-r--r--drivers/scsi/aic7xxx/aic7xxx_osm.c4
-rw-r--r--drivers/scsi/arcmsr/arcmsr_hba.c6
-rw-r--r--drivers/scsi/atp870u.c2
-rw-r--r--drivers/scsi/fdomain.c4
-rw-r--r--drivers/scsi/imm.c2
-rw-r--r--drivers/scsi/initio.c4
-rw-r--r--drivers/scsi/ipr.c8
-rw-r--r--drivers/scsi/ips.c2
-rw-r--r--drivers/scsi/ips.h2
-rw-r--r--drivers/scsi/libsas/sas_scsi_host.c2
-rw-r--r--drivers/scsi/megaraid.c4
-rw-r--r--drivers/scsi/megaraid.h2
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c4
-rw-r--r--drivers/scsi/mpi3mr/mpi3mr_os.c4
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c4
-rw-r--r--drivers/scsi/mvumi.c2
-rw-r--r--drivers/scsi/myrb.c2
-rw-r--r--drivers/scsi/pcmcia/sym53c500_cs.c2
-rw-r--r--drivers/scsi/ppa.c2
-rw-r--r--drivers/scsi/qla1280.c2
-rw-r--r--drivers/scsi/qlogicfas408.c2
-rw-r--r--drivers/scsi/qlogicfas408.h2
-rw-r--r--drivers/scsi/scsicam.c16
-rw-r--r--drivers/scsi/sd.c8
-rw-r--r--drivers/scsi/stex.c2
-rw-r--r--drivers/scsi/storvsc_drv.c2
-rw-r--r--drivers/scsi/wd719x.c2
-rw-r--r--drivers/target/target_core_pscsi.c2
-rw-r--r--fs/bcachefs/btree_io.c2
-rw-r--r--fs/bcachefs/data_update.h1
-rw-r--r--fs/bcachefs/journal.c6
-rw-r--r--fs/bcachefs/journal_io.c2
-rw-r--r--fs/bcachefs/super-io.c2
-rw-r--r--fs/iomap/direct-io.c5
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--include/linux/bio-integrity.h1
-rw-r--r--include/linux/bio.h18
-rw-r--r--include/linux/blk-integrity.h15
-rw-r--r--include/linux/blk-mq-dma.h11
-rw-r--r--include/linux/blk-mq.h4
-rw-r--r--include/linux/blk_types.h21
-rw-r--r--include/linux/blkdev.h26
-rw-r--r--include/linux/libata.h2
-rw-r--r--include/linux/uio.h2
-rw-r--r--include/scsi/libsas.h2
-rw-r--r--include/scsi/scsi_host.h2
-rw-r--r--include/scsi/scsicam.h7
-rw-r--r--lib/iov_iter.c95
-rw-r--r--tools/testing/selftests/ublk/Makefile1
-rw-r--r--tools/testing/selftests/ublk/kublk.c32
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_01.sh4
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_02.sh4
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_12.sh4
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_13.sh20
-rwxr-xr-xtools/testing/selftests/ublk/test_null_01.sh4
-rwxr-xr-xtools/testing/selftests/ublk/test_null_02.sh4
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_05.sh4
160 files changed, 3311 insertions, 1255 deletions
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 0ddffc9133d0..0ed10aeff86b 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -603,16 +603,10 @@ Date: July 2003
Contact: linux-block@vger.kernel.org
Description:
[RW] This controls how many requests may be allocated in the
- block layer for read or write requests. Note that the total
- allocated number may be twice this amount, since it applies only
- to reads or writes (not the accumulated sum).
-
- To avoid priority inversion through request starvation, a
- request queue maintains a separate request pool per each cgroup
- when CONFIG_BLK_CGROUP is enabled, and this parameter applies to
- each such per-block-cgroup request pool. IOW, if there are N
- block cgroups, each request queue may have up to N request
- pools, each independently regulated by nr_requests.
+	 	block layer. Note that this value only represents the quantity for a
+	 	single blk_mq_tags instance. The actual number for the entire
+	 	device depends on the hardware queue count, whether an elevator is
+	 	enabled, and whether tags are shared.
What: /sys/block/<disk>/queue/nr_zones
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 4ff2cc291d18..1c2eacc94758 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -347,6 +347,54 @@ All md devices contain:
active-idle
like active, but no writes have been seen for a while (safe_mode_delay).
+ consistency_policy
+ This indicates how the array maintains consistency in case of unexpected
+ shutdown. It can be:
+
+ none
+ Array has no redundancy information, e.g. raid0, linear.
+
+ resync
+ Full resync is performed and all redundancy is regenerated when the
+ array is started after unclean shutdown.
+
+ bitmap
+ Resync assisted by a write-intent bitmap.
+
+ journal
+ For raid4/5/6, journal device is used to log transactions and replay
+ after unclean shutdown.
+
+ ppl
+ For raid5 only, Partial Parity Log is used to close the write hole and
+ eliminate resync.
+
+ The accepted values when writing to this file are ``ppl`` and ``resync``,
+ used to enable and disable PPL.
+
+ uuid
+ This indicates the UUID of the array in the following format:
+ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+
+ bitmap_type
+     [RW] When read, this file will display the current and available
+     bitmap types for this array. The currently active bitmap type will
+     be enclosed in [] brackets. Writing a bitmap name or ID to this file
+     will switch control of this array to that bitmap. Note that writing
+     a new bitmap type for an already created array is forbidden.
+
+ none
+ No bitmap
+ bitmap
+ The default internal bitmap
+ llbitmap
+ The lockless internal bitmap
+
+If bitmap_type is not none, then the additional bitmap attributes bitmap/xxx or
+llbitmap/xxx will be created after the md device KOBJ_CHANGE event.
+
+If bitmap_type is bitmap, then the md device will also contain:
+
bitmap/location
This indicates where the write-intent bitmap for the array is
stored.
@@ -401,35 +449,23 @@ All md devices contain:
once the array becomes non-degraded, and this fact has been
recorded in the metadata.
- consistency_policy
- This indicates how the array maintains consistency in case of unexpected
- shutdown. It can be:
-
- none
- Array has no redundancy information, e.g. raid0, linear.
-
- resync
- Full resync is performed and all redundancy is regenerated when the
- array is started after unclean shutdown.
-
- bitmap
- Resync assisted by a write-intent bitmap.
+If bitmap_type is llbitmap, then the md device will also contain:
- journal
- For raid4/5/6, journal device is used to log transactions and replay
- after unclean shutdown.
+ llbitmap/bits
+     This file is read-only. It shows the status of the bitmap bits, i.e.
+     the number of bits with each value.
- ppl
- For raid5 only, Partial Parity Log is used to close the write hole and
- eliminate resync.
-
- The accepted values when writing to this file are ``ppl`` and ``resync``,
- used to enable and disable PPL.
+ llbitmap/metadata
+     This file is read-only. It shows the bitmap metadata, including
+     chunksize, chunkshift, chunks, offset and daemon_sleep.
- uuid
- This indicates the UUID of the array in the following format:
- xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ llbitmap/daemon_sleep
+     This file is read-write. It is the time, in seconds, after which the
+     daemon function is triggered to clear dirty bits.
+ llbitmap/barrier_idle
+     This file is read-write. It is the time, in seconds, after which a page
+     barrier is considered idle and the dirty bits in the page will be cleared.
As component devices are added to an md array, they appear in the ``md``
directory as new directories named::
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index aa287ccdac2f..77704fde9845 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -443,7 +443,7 @@ prototypes::
int (*direct_access) (struct block_device *, sector_t, void **,
unsigned long *);
void (*unlock_native_capacity) (struct gendisk *);
- int (*getgeo)(struct block_device *, struct hd_geometry *);
+ int (*getgeo)(struct gendisk *, struct hd_geometry *);
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
locking rules:
diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst
index 3ac4c7fafb55..634f5c28a849 100644
--- a/Documentation/scsi/scsi_mid_low_api.rst
+++ b/Documentation/scsi/scsi_mid_low_api.rst
@@ -380,7 +380,7 @@ Details::
/**
* scsi_bios_ptable - return copy of block device's partition table
- * @dev: pointer to block device
+ * @dev: pointer to gendisk
*
* Returns pointer to partition table, or NULL for failure
*
@@ -390,7 +390,7 @@ Details::
*
* Defined in: drivers/scsi/scsicam.c
**/
- unsigned char *scsi_bios_ptable(struct block_device *dev)
+ unsigned char *scsi_bios_ptable(struct gendisk *dev)
/**
@@ -623,7 +623,7 @@ Details::
* bios_param - fetch head, sector, cylinder info for a disk
* @sdev: pointer to scsi device context (defined in
* include/scsi/scsi_device.h)
- * @bdev: pointer to block device context (defined in fs.h)
+ * @disk: pointer to gendisk (defined in blkdev.h)
* @capacity: device size (in 512 byte sectors)
* @params: three element array to place output:
* params[0] number of heads (max 255)
@@ -643,7 +643,7 @@ Details::
*
* Optionally defined in: LLD
**/
- int bios_param(struct scsi_device * sdev, struct block_device *bdev,
+ int bios_param(struct scsi_device * sdev, struct gendisk *disk,
sector_t capacity, int params[3])
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 83410f8184ec..94a4fadc651a 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -77,9 +77,9 @@ static void nfhd_submit_bio(struct bio *bio)
bio_endio(bio);
}
-static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int nfhd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct nfhd_device *dev = bdev->bd_disk->private_data;
+ struct nfhd_device *dev = disk->private_data;
geo->cylinders = dev->blocks >> (6 - dev->bshift);
geo->heads = 4;
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 4de6613e7468..f2b2feeeb455 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -108,7 +108,7 @@ static DEFINE_MUTEX(ubd_lock);
static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
-static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo);
#define MAX_DEV (16)
@@ -1324,9 +1324,9 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
return res;
}
-static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct ubd *ubd_dev = bdev->bd_disk->private_data;
+ struct ubd *ubd_dev = disk->private_data;
geo->heads = 128;
geo->sectors = 32;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 50e51047e1fe..4a8d3d96bfe4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7109,9 +7109,10 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+static void bfq_depth_updated(struct request_queue *q)
{
- unsigned int nr_requests = bfqd->queue->nr_requests;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ unsigned int nr_requests = q->nr_requests;
/*
* In-word depths if no bfq_queue is being weight-raised:
@@ -7143,21 +7144,8 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
-}
-
-static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
-{
- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
- bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
-}
-
-static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
-{
- bfq_depth_updated(hctx);
- return 0;
+ blk_mq_set_min_shallow_depth(q, 1);
}
static void bfq_exit_queue(struct elevator_queue *e)
@@ -7369,6 +7357,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
goto out_free;
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+ bfq_depth_updated(q);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -7628,7 +7617,6 @@ static struct elevator_type iosched_bfq_mq = {
.request_merged = bfq_request_merged,
.has_work = bfq_has_work,
.depth_updated = bfq_depth_updated,
- .init_hctx = bfq_init_hctx,
.init_sched = bfq_init_queue,
.exit_sched = bfq_exit_queue,
},
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 6b077ca937f6..bed26f1ec869 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -230,7 +230,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
}
static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
- int nr_vecs, ssize_t bytes, ssize_t offset)
+ int nr_vecs, ssize_t bytes, ssize_t offset,
+ bool *is_p2p)
{
unsigned int nr_bvecs = 0;
int i, j;
@@ -251,6 +252,9 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
bytes -= next;
}
+ if (is_pci_p2pdma_page(pages[i]))
+ *is_p2p = true;
+
bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
offset = 0;
nr_bvecs++;
@@ -262,13 +266,13 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ iov_iter_extraction_t extraction_flags = 0;
size_t offset, bytes = iter->count;
+ bool copy, is_p2p = false;
unsigned int nr_bvecs;
int ret, nr_vecs;
- bool copy;
if (bio_integrity(bio))
return -EINVAL;
@@ -285,16 +289,25 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
pages = NULL;
}
- copy = !iov_iter_is_aligned(iter, align, align);
- ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
+ copy = iov_iter_alignment(iter) &
+ blk_lim_dma_alignment_and_pad(&q->limits);
+
+ if (blk_queue_pci_p2pdma(q))
+ extraction_flags |= ITER_ALLOW_P2PDMA;
+
+ ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs,
+ extraction_flags, &offset);
if (unlikely(ret < 0))
goto free_bvec;
- nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+ nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset,
+ &is_p2p);
if (pages != stack_pages)
kvfree(pages);
if (nr_bvecs > queue_max_integrity_segments(q))
copy = true;
+ if (is_p2p)
+ bio->bi_opf |= REQ_NOMERGE;
if (copy)
ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes);
diff --git a/block/bio.c b/block/bio.c
index 44c43b970387..3a1a848940dd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
- bio->bi_issue.value = 0;
+ bio->issue_time_ns = 0;
if (bdev)
bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
@@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache->nr--;
put_cpu();
- bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf);
+ if (nr_vecs)
+ bio_init_inline(bio, bdev, nr_vecs, opf);
+ else
+ bio_init(bio, bdev, NULL, nr_vecs, opf);
bio->bi_pool = bs;
return bio;
}
@@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
bio_init(bio, bdev, bvl, nr_vecs, opf);
} else if (nr_vecs) {
- bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+ bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf);
} else {
bio_init(bio, bdev, NULL, 0, opf);
}
@@ -614,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
- return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
+ return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec),
+ gfp_mask);
}
EXPORT_SYMBOL(bio_kmalloc);
@@ -1227,13 +1231,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
extraction_flags |= ITER_ALLOW_P2PDMA;
- /*
- * Each segment in the iov is required to be a block size multiple.
- * However, we may not be able to get the entire segment if it spans
- * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
- * result to ensure the bio's total size is correct. The remainder of
- * the iov data will be picked up in the next bio iteration.
- */
size = iov_iter_extract_pages(iter, &pages,
UINT_MAX - bio->bi_iter.bi_size,
nr_pages, extraction_flags, &offset);
@@ -1241,18 +1238,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return size ? size : -EFAULT;
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-
- if (bio->bi_bdev) {
- size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
- iov_iter_revert(iter, trim);
- size -= trim;
- }
-
- if (unlikely(!size)) {
- ret = -EFAULT;
- goto out;
- }
-
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
@@ -1297,10 +1282,44 @@ out:
return ret;
}
+/*
+ * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
+ * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
+ * for the next iteration.
+ */
+static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
+ unsigned len_align_mask)
+{
+ size_t nbytes = bio->bi_iter.bi_size & len_align_mask;
+
+ if (!nbytes)
+ return 0;
+
+ iov_iter_revert(iter, nbytes);
+ bio->bi_iter.bi_size -= nbytes;
+ do {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (nbytes < bv->bv_len) {
+ bv->bv_len -= nbytes;
+ break;
+ }
+
+ bio_release_page(bio, bv->bv_page);
+ bio->bi_vcnt--;
+ nbytes -= bv->bv_len;
+ } while (nbytes);
+
+ if (!bio->bi_vcnt)
+ return -EFAULT;
+ return 0;
+}
+
/**
- * bio_iov_iter_get_pages - add user or kernel pages to a bio
+ * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be added
+ * @len_align_mask: the mask to align the total size to, 0 for any length
*
* This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and
@@ -1317,7 +1336,8 @@ out:
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
*/
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+ unsigned len_align_mask)
{
int ret = 0;
@@ -1336,9 +1356,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- return bio->bi_vcnt ? 0 : ret;
+ if (bio->bi_vcnt)
+ return bio_iov_iter_align_down(bio, iter, len_align_mask);
+ return ret;
}
-EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned);
static void submit_bio_wait_endio(struct bio *bio)
{
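
With the old trimming logic removed from __bio_iov_iter_get_pages(), length
alignment is now requested explicitly through the mask argument of the new
bio_iov_iter_get_pages_aligned() (0 accepts any length, per the kernel-doc
above). A minimal, hedged caller sketch follows; demo_fill_bio is an
illustrative name, not part of this patch:

	/* Illustrative sketch only: keep the bio size logical-block aligned. */
	static int demo_fill_bio(struct bio *bio, struct iov_iter *iter,
				 struct block_device *bdev)
	{
		return bio_iov_iter_get_pages_aligned(bio, iter,
				bdev_logical_block_size(bdev) - 1);
	}
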
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fe9ebd6a2e14..f93de34fe87d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -110,12 +110,6 @@ static struct cgroup_subsys_state *blkcg_css(void)
return task_css(current, io_cgrp_id);
}
-static bool blkcg_policy_enabled(struct request_queue *q,
- const struct blkcg_policy *pol)
-{
- return pol && test_bit(pol->plid, q->blkcg_pols);
-}
-
static void blkg_free_workfn(struct work_struct *work)
{
struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
@@ -883,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
disk = ctx->bdev->bd_disk;
q = disk->queue;
- /*
- * blkcg_deactivate_policy() requires queue to be frozen, we can grab
- * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
- */
- ret = blk_queue_enter(q, 0);
- if (ret)
- goto fail;
-
+	/* Prevent concurrency with blkcg_deactivate_policy() */
+ mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
if (!blkcg_policy_enabled(q, pol)) {
@@ -920,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
/* Drop locks to do new blkg allocation with GFP_KERNEL. */
spin_unlock_irq(&q->queue_lock);
- new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
+ new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto fail_exit_queue;
+ goto fail_exit;
}
if (radix_tree_preload(GFP_KERNEL)) {
blkg_free(new_blkg);
ret = -ENOMEM;
- goto fail_exit_queue;
+ goto fail_exit;
}
spin_lock_irq(&q->queue_lock);
@@ -957,7 +945,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
goto success;
}
success:
- blk_queue_exit(q);
+ mutex_unlock(&q->blkcg_mutex);
ctx->blkg = blkg;
return 0;
@@ -965,9 +953,8 @@ fail_preloaded:
radix_tree_preload_end();
fail_unlock:
spin_unlock_irq(&q->queue_lock);
-fail_exit_queue:
- blk_queue_exit(q);
-fail:
+fail_exit:
+ mutex_unlock(&q->blkcg_mutex);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 81868ad86330..1cce3294634d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -370,11 +370,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q)))
-static inline void blkcg_bio_issue_init(struct bio *bio)
-{
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-}
-
static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
@@ -459,6 +454,12 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}
+static inline bool blkcg_policy_enabled(struct request_queue *q,
+ const struct blkcg_policy *pol)
+{
+ return pol && test_bit(pol->plid, q->blkcg_pols);
+}
+
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else /* CONFIG_BLK_CGROUP */
@@ -491,7 +492,6 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
-static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 4201504158a1..dd39ff651095 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -539,7 +539,7 @@ static inline void bio_check_ro(struct bio *bio)
}
}
-static noinline int should_fail_bio(struct bio *bio)
+int should_fail_bio(struct bio *bio)
{
if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
return -EIO;
@@ -725,10 +725,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
current->bio_list = NULL;
}
-void submit_bio_noacct_nocheck(struct bio *bio)
+void submit_bio_noacct_nocheck(struct bio *bio, bool split)
{
blk_cgroup_bio_start(bio);
- blkcg_bio_issue_init(bio);
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_queue(bio);
@@ -745,12 +744,16 @@ void submit_bio_noacct_nocheck(struct bio *bio)
	 * to collect a list of requests submitted by a ->submit_bio method while
* it is active, and then process them after it returned.
*/
- if (current->bio_list)
- bio_list_add(&current->bio_list[0], bio);
- else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
+ if (current->bio_list) {
+ if (split)
+ bio_list_add_head(&current->bio_list[0], bio);
+ else
+ bio_list_add(&current->bio_list[0], bio);
+ } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
__submit_bio_noacct_mq(bio);
- else
+ } else {
__submit_bio_noacct(bio);
+ }
}
static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
@@ -871,7 +874,7 @@ void submit_bio_noacct(struct bio *bio)
if (blk_throtl_bio(bio))
return;
- submit_bio_noacct_nocheck(bio);
+ submit_bio_noacct_nocheck(bio, false);
return;
not_supported:
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 005c9157ffb3..86b27f96051a 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio = bio_kmalloc(nr_segs, GFP_NOIO);
if (!bio)
return NULL;
- bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs,
- bio_src->bi_opf);
+ bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
@@ -222,18 +221,14 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
if (++i == BIO_MAX_VECS)
break;
}
- if (num_sectors < bio_sectors(bio)) {
- struct bio *split_bio;
- split_bio = bio_split(bio, num_sectors, GFP_NOIO,
- &crypto_bio_split);
- if (IS_ERR(split_bio)) {
- bio->bi_status = BLK_STS_RESOURCE;
+ if (num_sectors < bio_sectors(bio)) {
+ bio = bio_submit_split_bioset(bio, num_sectors,
+ &crypto_bio_split);
+ if (!bio)
return false;
- }
- bio_chain(split_bio, bio);
- submit_bio_noacct(bio);
- *bio_ptr = split_bio;
+
+ *bio_ptr = bio;
}
return true;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 2f8fdecdd7a9..45bd18f68541 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
mod_timer(&blkiolat->timer, jiffies + HZ);
}
-static void iolatency_record_time(struct iolatency_grp *iolat,
- struct bio_issue *issue, u64 now,
- bool issue_as_root)
+static void iolatency_record_time(struct iolatency_grp *iolat, u64 start,
+ u64 now, bool issue_as_root)
{
- u64 start = bio_issue_time(issue);
u64 req_time;
- /*
- * Have to do this so we are truncated to the correct time that our
- * issue is truncated to.
- */
- now = __bio_issue_time(now);
-
if (now <= start)
return;
@@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
* submitted, so do not account for it.
*/
if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
- iolatency_record_time(iolat, &bio->bi_issue, now,
+ iolatency_record_time(iolat, bio->issue_time_ns, now,
issue_as_root);
window_start = atomic64_read(&iolat->window_start);
if (now > window_start &&
@@ -750,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
*/
enabled = atomic_read(&blkiolat->enable_cnt);
if (enabled != blkiolat->enabled) {
+ struct request_queue *q = blkiolat->rqos.disk->queue;
unsigned int memflags;
memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
blkiolat->enabled = enabled;
+ if (enabled)
+ blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q);
blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags);
}
}
diff --git a/block/blk-map.c b/block/blk-map.c
index 23e5d5ebe59e..165f2234f00f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
goto out_bmd;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq));
+ bio_init_inline(bio, NULL, nr_pages, req_op(rq));
if (map_data) {
nr_pages = 1U << map_data->page_order;
@@ -253,10 +253,11 @@ static void blk_mq_map_bio_put(struct bio *bio)
static struct bio *blk_rq_map_bio_alloc(struct request *rq,
unsigned int nr_vecs, gfp_t gfp_mask)
{
+ struct block_device *bdev = rq->q->disk ? rq->q->disk->part0 : NULL;
struct bio *bio;
if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) {
- bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask,
+ bio = bio_alloc_bioset(bdev, nr_vecs, rq->cmd_flags, gfp_mask,
&fs_bio_set);
if (!bio)
return NULL;
@@ -264,7 +265,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq,
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return NULL;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
+ bio_init_inline(bio, bdev, nr_vecs, req_op(rq));
}
return bio;
}
@@ -326,7 +327,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
+ bio_init_inline(bio, NULL, nr_vecs, op);
if (is_vmalloc_addr(data)) {
bio->bi_private = data;
if (!bio_add_vmalloc(bio, data, len)) {
@@ -392,7 +393,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
+ bio_init_inline(bio, NULL, nr_pages, op);
while (len) {
struct page *page;
@@ -443,7 +444,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
int ret;
/* check that the data layout matches the hardware restrictions */
- ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes);
+ ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0);
if (ret) {
/* if we would have to split the bio, copy instead */
if (ret > 0)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 70d704615be5..37864c5d287e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -104,34 +104,58 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
+/*
+ * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector
+ * @bio: the original bio to be submitted and split
+ * @split_sectors: the sector count at which to split
+ * @bs: the bio set used for allocating the new split bio
+ *
+ * The original bio is modified to contain the remaining sectors and submitted.
+ * The caller is responsible for submitting the returned bio.
+ *
+ * On success, the newly allocated bio representing the initial part will be
+ * returned; on failure, NULL will be returned and the original bio will fail.
+ */
+struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
+ struct bio_set *bs)
+{
+ struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return NULL;
+ }
+
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
+
+ if (should_fail_bio(bio))
+ bio_io_error(bio);
+ else if (!blk_throtl_bio(bio))
+ submit_bio_noacct_nocheck(bio, true);
+
+ return split;
+}
+EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
+
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
- if (unlikely(split_sectors < 0))
- goto error;
+ if (unlikely(split_sectors < 0)) {
+ bio->bi_status = errno_to_blk_status(split_sectors);
+ bio_endio(bio);
+ return NULL;
+ }
if (split_sectors) {
- struct bio *split;
-
- split = bio_split(bio, split_sectors, GFP_NOIO,
+ bio = bio_submit_split_bioset(bio, split_sectors,
&bio->bi_bdev->bd_disk->bio_split);
- if (IS_ERR(split)) {
- split_sectors = PTR_ERR(split);
- goto error;
- }
- split->bi_opf |= REQ_NOMERGE;
- blkcg_bio_issue_init(split);
- bio_chain(split, bio);
- trace_block_split(split, bio->bi_iter.bi_sector);
- WARN_ON_ONCE(bio_zone_write_plugging(bio));
- submit_bio_noacct(bio);
- return split;
+ if (bio)
+ bio->bi_opf |= REQ_NOMERGE;
}
return bio;
-error:
- bio->bi_status = errno_to_blk_status(split_sectors);
- bio_endio(bio);
- return NULL;
}
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
@@ -279,25 +303,30 @@ static unsigned int bio_split_alignment(struct bio *bio,
}
/**
- * bio_split_rw_at - check if and where to split a read/write bio
+ * bio_split_io_at - check if and where to split a bio
* @bio: [in] bio to be split
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
* @max_bytes: [in] maximum number of bytes per bio
+ * @len_align_mask: [in] length alignment mask for each vector
*
* Find out if @bio needs to be split to fit the queue limits in @lim and a
* maximum size of @max_bytes. Returns a negative error number if @bio can't be
* split, 0 if the bio doesn't have to be split, or a positive sector offset if
* @bio needs to be split.
*/
-int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, unsigned max_bytes)
+int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, bytes = 0;
bio_for_each_bvec(bv, bio, iter) {
+ if (bv.bv_offset & lim->dma_alignment ||
+ bv.bv_len & len_align_mask)
+ return -EINVAL;
+
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -339,8 +368,16 @@ split:
* Individual bvecs might not be logical block aligned. Round down the
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
+ *
+ * It is possible to submit a bio that can't be split into a valid io:
+	 * the bio may either have too many discontiguous vectors for the max
+	 * segments limit, or contain virtual boundary gaps without having a
+	 * valid block sized split. A zero byte result means one of those
+	 * conditions occurred.
*/
bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
+ if (!bytes)
+ return -EINVAL;
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@@ -350,7 +387,7 @@ split:
bio_clear_polled(bio);
return bytes >> SECTOR_SHIFT;
}
-EXPORT_SYMBOL_GPL(bio_split_rw_at);
+EXPORT_SYMBOL_GPL(bio_split_io_at);
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs)
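
Per the kernel-doc above, bio_submit_split_bioset() resubmits the tail of the
original bio and hands the front part back to the caller, which must submit
it. A hedged caller sketch, modeled on the blk-crypto-fallback hunk earlier in
this patch; demo_split_front is an illustrative name:

	/* Illustrative sketch: keep the first num_sectors, the rest is resubmitted. */
	static bool demo_split_front(struct bio **bio_ptr, unsigned int num_sectors,
				     struct bio_set *bs)
	{
		struct bio *bio = *bio_ptr;

		if (num_sectors < bio_sectors(bio)) {
			bio = bio_submit_split_bioset(bio, num_sectors, bs);
			if (!bio)
				return false;	/* original bio was already failed */
			*bio_ptr = bio;
		}
		return true;
	}
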
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 32c65efdda46..4896525b1c05 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
QUEUE_FLAG_NAME(QOS_ENABLED),
+ QUEUE_FLAG_NAME(BIO_ISSUE_TIME),
};
#undef QUEUE_FLAG_NAME
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 660b5e200ccf..449950029872 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -174,6 +174,10 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
phys_to_page(vec.paddr))) {
case PCI_P2PDMA_MAP_BUS_ADDR:
+ if (iter->iter.is_integrity)
+ bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
+ else
+ req->cmd_flags |= REQ_P2PDMA;
return blk_dma_map_bus(iter, &vec);
case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
/*
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e2ce4a28e6c9..d06bb137a743 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -454,7 +454,7 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table,
}
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
- unsigned int nr_hw_queues)
+ unsigned int nr_hw_queues, unsigned int nr_requests)
{
unsigned int nr_tags;
int i;
@@ -470,13 +470,8 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
nr_tags * sizeof(struct blk_mq_tags *), gfp);
if (!et)
return NULL;
- /*
- * Default to double of smaller one between hw queue_depth and
- * 128, since we don't split into sync/async like the old code
- * did. Additionally, this is a per-hw queue depth.
- */
- et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
- BLKDEV_DEFAULT_RQ);
+
+ et->nr_requests = nr_requests;
et->nr_hw_queues = nr_hw_queues;
if (blk_mq_is_shared_tags(set->flags)) {
@@ -521,7 +516,8 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
* concurrently.
*/
if (q->elevator) {
- et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+ et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
+ blk_mq_default_nr_requests(set));
if (!et)
goto out_unwind;
if (xa_insert(et_table, q->id, et, gfp))
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index b554e1d55950..8e21a6b1415d 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -24,7 +24,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
- unsigned int nr_hw_queues);
+ unsigned int nr_hw_queues, unsigned int nr_requests);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
@@ -92,4 +92,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
+static inline void blk_mq_set_min_shallow_depth(struct request_queue *q,
+ unsigned int depth)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+ depth);
+}
+
#endif
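
Since depth_updated() now receives the request queue rather than a single
hctx (see the bfq and blk-mq.c hunks), a scheduler can refresh every hardware
queue through this helper in one call. A hedged sketch of such a callback,
modeled on the bfq change above; demo_depth_updated and the debug print are
illustrative only:

	/* Illustrative sketch of a depth_updated() callback in the new form. */
	static void demo_depth_updated(struct request_queue *q)
	{
		unsigned int nr_requests = q->nr_requests;

		/* Recompute any nr_requests-derived scheduler depths here. */
		pr_debug("demo: nr_requests is now %u\n", nr_requests);

		/* Reset the minimum shallow depth across all hardware queues. */
		blk_mq_set_min_shallow_depth(q, 1);
	}
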
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 5c399ac562ea..58ec293373c6 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -34,7 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
kfree(hctx->ctxs);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d880c50629d6..c7a4d4b9cc87 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -8,6 +8,9 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/kmemleak.h>
#include <linux/delay.h>
#include "blk.h"
@@ -253,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
unsigned int bitnr)
{
struct request *rq;
- unsigned long flags;
- spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
- spin_unlock_irqrestore(&tags->lock, flags);
return rq;
}
@@ -297,15 +297,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
- * @q: Request queue to examine.
+ * @q: Request queue @hctx is associated with (@hctx->queue).
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
* associated with @hctx that has been assigned a driver tag.
- * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
- * where rq is a pointer to a request. Return true to continue
- * iterating tags, false to stop.
- * @data: Will be passed as third argument to @fn.
+ * @fn will be called as follows: @fn(rq, @data) where rq is a
+ * pointer to a request. Return %true to continue iterating tags;
+ * %false to stop.
+ * @data: Will be passed as second argument to @fn.
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
@@ -371,9 +371,9 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each started
- * request. @fn will be called as follows: @fn(rq, @data,
- * @reserved) where rq is a pointer to a request. Return true
- * to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @data) where rq
+ * is a pointer to a request. Return %true to continue iterating
+ * tags; %false to stop.
* @data: Will be passed as second argument to @fn.
* @flags: BT_TAG_ITER_*
*/
@@ -406,10 +406,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
* blk_mq_all_tag_iter - iterate over all requests in a tag map
* @tags: Tag map to iterate over.
* @fn: Pointer to the function that will be called for each
- * request. @fn will be called as follows: @fn(rq, @priv,
- * reserved) where rq is a pointer to a request. 'reserved'
- * indicates whether or not @rq is a reserved request. Return
- * true to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @priv) where rq
+ * is a pointer to a request. Return %true to continue iterating
+ * tags; %false to stop.
* @priv: Will be passed as second argument to @fn.
*
* Caller has to pass the tag map from which requests are allocated.
@@ -424,10 +423,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
* blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
* @tagset: Tag set to iterate over.
* @fn: Pointer to the function that will be called for each started
- * request. @fn will be called as follows: @fn(rq, @priv,
- * reserved) where rq is a pointer to a request. 'reserved'
- * indicates whether or not @rq is a reserved request. Return
- * true to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @priv) where
+ * rq is a pointer to a request. Return true to continue iterating
+ * tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*
* We grab one request reference before calling @fn and release it after
@@ -437,7 +435,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
unsigned int flags = tagset->flags;
- int i, nr_tags;
+ int i, nr_tags, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&tagset->tags_srcu);
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
@@ -446,6 +446,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
}
+ srcu_read_unlock(&tagset->tags_srcu, srcu_idx);
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
@@ -483,11 +484,10 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.
* @fn: Pointer to the function that will be called for each request
- * on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
- * reserved) where rq is a pointer to a request and hctx points
- * to the hardware queue associated with the request. 'reserved'
- * indicates whether or not @rq is a reserved request.
- * @priv: Will be passed as third argument to @fn.
+ * on @q. @fn will be called as follows: @fn(rq, @priv) where rq
+ * is a pointer to a request and hctx points to the hardware queue
+ * associated with the request.
+ * @priv: Will be passed as second argument to @fn.
*
* Note: if @q->tag_set is shared with other request queues then @fn will be
* called for all requests on all queues that share that tag set and not only
@@ -496,6 +496,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
+ int srcu_idx;
+
/*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
@@ -504,6 +506,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
+ srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu);
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
struct blk_mq_tags *tags = q->tag_set->shared_tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
@@ -533,6 +536,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
bt_for_each(hctx, q, btags, fn, priv, false);
}
}
+ srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx);
blk_queue_exit(q);
}
@@ -562,6 +566,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
+ INIT_LIST_HEAD(&tags->page_list);
+
if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
goto out_free_tags;
if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node))
@@ -576,63 +582,37 @@ out_free_tags:
return NULL;
}
-void blk_mq_free_tags(struct blk_mq_tags *tags)
+static void blk_mq_free_tags_callback(struct rcu_head *head)
{
- sbitmap_queue_free(&tags->bitmap_tags);
- sbitmap_queue_free(&tags->breserved_tags);
- kfree(tags);
-}
-
-int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_tags **tagsptr, unsigned int tdepth,
- bool can_grow)
-{
- struct blk_mq_tags *tags = *tagsptr;
-
- if (tdepth <= tags->nr_reserved_tags)
- return -EINVAL;
-
- /*
- * If we are allowed to grow beyond the original size, allocate
- * a new set of tags before freeing the old one.
- */
- if (tdepth > tags->nr_tags) {
- struct blk_mq_tag_set *set = hctx->queue->tag_set;
- struct blk_mq_tags *new;
-
- if (!can_grow)
- return -EINVAL;
-
- /*
- * We need some sort of upper limit, set it high enough that
- * no valid use cases should require more.
- */
- if (tdepth > MAX_SCHED_RQ)
- return -EINVAL;
+ struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags,
+ rcu_head);
+ struct page *page;
+ while (!list_empty(&tags->page_list)) {
+ page = list_first_entry(&tags->page_list, struct page, lru);
+ list_del_init(&page->lru);
/*
- * Only the sbitmap needs resizing since we allocated the max
- * initially.
+ * Remove kmemleak object previously allocated in
+ * blk_mq_alloc_rqs().
*/
- if (blk_mq_is_shared_tags(set->flags))
- return 0;
+ kmemleak_free(page_address(page));
+ __free_pages(page, page->private);
+ }
+ kfree(tags);
+}
- new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
- if (!new)
- return -ENOMEM;
+void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
+{
+ sbitmap_queue_free(&tags->bitmap_tags);
+ sbitmap_queue_free(&tags->breserved_tags);
- blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
- *tagsptr = new;
- } else {
- /*
- * Don't need (or can't) update reserved tags here, they
- * remain static and should never need resizing.
- */
- sbitmap_queue_resize(&tags->bitmap_tags,
- tdepth - tags->nr_reserved_tags);
+	/* if tag pages are not allocated yet, free tags directly */
+ if (list_empty(&tags->page_list)) {
+ kfree(tags);
+ return;
}
- return 0;
+ call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback);
}
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ba3a4b77f578..09f579414161 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -396,6 +396,15 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
#endif
}
+static inline void blk_mq_bio_issue_init(struct request_queue *q,
+ struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags))
+ bio->issue_time_ns = blk_time_get_ns();
+#endif
+}
+
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag)
{
@@ -3168,6 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
goto queue_exit;
+ blk_mq_bio_issue_init(q, bio);
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
@@ -3415,7 +3425,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
struct blk_mq_tags *tags)
{
struct page *page;
- unsigned long flags;
/*
* There is no need to clear mapping if driver tags is not initialized
@@ -3439,22 +3448,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
}
}
}
-
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&drv_tags->lock, flags);
- spin_unlock_irqrestore(&drv_tags->lock, flags);
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags;
- struct page *page;
if (list_empty(&tags->page_list))
return;
@@ -3478,27 +3477,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
blk_mq_clear_rq_mapping(drv_tags, tags);
-
- while (!list_empty(&tags->page_list)) {
- page = list_first_entry(&tags->page_list, struct page, lru);
- list_del_init(&page->lru);
- /*
- * Remove kmemleak object previously allocated in
- * blk_mq_alloc_rqs().
- */
- kmemleak_free(page_address(page));
- __free_pages(page, page->private);
- }
+ /*
+ * Free request pages in SRCU callback, which is called from
+ * blk_mq_free_tags().
+ */
}
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
}
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
@@ -3560,7 +3552,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
err_free_rqs:
kfree(tags->rqs);
err_free_tags:
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
return NULL;
}
@@ -3590,8 +3582,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- INIT_LIST_HEAD(&tags->page_list);
-
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
@@ -3678,8 +3668,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
struct rq_iter_data data = {
.hctx = hctx,
};
+ int srcu_idx;
+ srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu);
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx);
+
return data.has_rq;
}
@@ -3899,7 +3893,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
unsigned int queue_depth, struct request *flush_rq)
{
int i;
- unsigned long flags;
/* The hw queue may not be mapped yet */
if (!tags)
@@ -3909,15 +3902,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
+}
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&tags->lock, flags);
- spin_unlock_irqrestore(&tags->lock, flags);
+static void blk_free_flush_queue_callback(struct rcu_head *head)
+{
+ struct blk_flush_queue *fq =
+ container_of(head, struct blk_flush_queue, rcu_head);
+
+ blk_free_flush_queue(fq);
}
/* hctx->ctxs will be freed in queue's release handler */
@@ -3939,6 +3931,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
+ call_srcu(&set->tags_srcu, &hctx->fq->rcu_head,
+ blk_free_flush_queue_callback);
+ hctx->fq = NULL;
+
xa_erase(&q->hctx_table, hctx_idx);
spin_lock(&q->unused_hctx_lock);
@@ -3964,13 +3960,19 @@ static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
+ if (!hctx->fq)
+ goto fail;
+
hctx->queue_num = hctx_idx;
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
- goto fail;
+ goto fail_free_fq;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
@@ -3987,6 +3989,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
+ fail_free_fq:
+ blk_free_flush_queue(hctx->fq);
+ hctx->fq = NULL;
fail:
return -1;
}
@@ -4038,16 +4043,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
- hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
- if (!hctx->fq)
- goto free_bitmap;
-
blk_mq_hctx_kobj_init(hctx);
return hctx;
- free_bitmap:
- sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
free_cpumask:
@@ -4101,7 +4100,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
if (ret) {
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
return NULL;
}
@@ -4129,7 +4128,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
{
if (tags) {
blk_mq_free_rqs(set, tags, hctx_idx);
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
}
}
@@ -4828,6 +4827,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_srcu;
}
+ ret = init_srcu_struct(&set->tags_srcu);
+ if (ret)
+ goto out_cleanup_srcu;
init_rwsem(&set->update_nr_hwq_lock);
@@ -4836,7 +4838,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
- goto out_cleanup_srcu;
+ goto out_cleanup_tags_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4865,6 +4867,8 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_tags_srcu:
+ cleanup_srcu_struct(&set->tags_srcu);
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
@@ -4910,6 +4914,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+
+ srcu_barrier(&set->tags_srcu);
+ cleanup_srcu_struct(&set->tags_srcu);
if (set->flags & BLK_MQ_F_BLOCKING) {
cleanup_srcu_struct(set->srcu);
kfree(set->srcu);
@@ -4917,57 +4924,59 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *et,
+ unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *old_et = NULL;
struct blk_mq_hw_ctx *hctx;
- int ret;
unsigned long i;
- if (WARN_ON_ONCE(!q->mq_freeze_depth))
- return -EINVAL;
-
- if (!set)
- return -EINVAL;
-
- if (q->nr_requests == nr)
- return 0;
-
blk_mq_quiesce_queue(q);
- ret = 0;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->tags)
- continue;
+ if (blk_mq_is_shared_tags(set->flags)) {
/*
- * If we're using an MQ scheduler, just update the scheduler
- * queue depth. This is similar to what the old code would do.
+ * Shared tags: for sched tags we allocate the max initially, hence
+ * tags can't grow; see blk_mq_alloc_sched_tags().
*/
- if (hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
- nr, true);
- } else {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
- false);
+ if (q->elevator)
+ blk_mq_tag_update_sched_shared_tags(q);
+ else
+ blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (!q->elevator) {
+ /*
+ * Non-shared hardware tags: nr has already been checked in
+ * queue_requests_store(), and tags can't grow.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->tags)
+ continue;
+ sbitmap_queue_resize(&hctx->tags->bitmap_tags,
+ nr - hctx->tags->nr_reserved_tags);
}
- if (ret)
- break;
- if (q->elevator && q->elevator->type->ops.depth_updated)
- q->elevator->type->ops.depth_updated(hctx);
- }
- if (!ret) {
- q->nr_requests = nr;
- if (blk_mq_is_shared_tags(set->flags)) {
- if (q->elevator)
- blk_mq_tag_update_sched_shared_tags(q);
- else
- blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (nr <= q->elevator->et->nr_requests) {
+ /* Non-shared sched tags, and tags don't grow. */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->sched_tags)
+ continue;
+ sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags,
+ nr - hctx->sched_tags->nr_reserved_tags);
}
+ } else {
+ /* Non-shared sched tags, and tags grow */
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = et->tags[i];
+ old_et = q->elevator->et;
+ q->elevator->et = et;
}
- blk_mq_unquiesce_queue(q);
+ q->nr_requests = nr;
+ if (q->elevator && q->elevator->type->ops.depth_updated)
+ q->elevator->type->ops.depth_updated(q);
- return ret;
+ blk_mq_unquiesce_queue(q);
+ return old_et;
}
/*
diff --git a/block/blk-mq.h b/block/blk-mq.h
index affb2e14b56e..af42dc018808 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -6,6 +6,7 @@
#include "blk-stat.h"
struct blk_mq_tag_set;
+struct elevator_tags;
struct blk_mq_ctxs {
struct kobject kobj;
@@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *tags,
+ unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
bool);
@@ -59,7 +62,7 @@ void blk_mq_put_rq_ref(struct request *rq);
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
@@ -110,6 +113,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
}
/*
+ * Default to double the smaller of the hw queue_depth and 128,
+ * since we don't split into sync/async like the old code did.
+ * Additionally, this is a per-hw-queue depth.
+ */
+static inline unsigned int blk_mq_default_nr_requests(
+ struct blk_mq_tag_set *set)
+{
+ return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ);
+}
+
+/*
* sysfs helpers
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
@@ -162,7 +176,7 @@ struct blk_mq_alloc_data {
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags, unsigned int flags, int node);
-void blk_mq_free_tags(struct blk_mq_tags *tags);
+void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
@@ -170,8 +184,6 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
-int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_tags **tags, unsigned int depth, bool can_grow);
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size);
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
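
blk_mq_default_nr_requests() above simply doubles the smaller of the hardware queue depth and the 128 mentioned in its comment. A stand-alone user-space sketch of the same arithmetic; the constant and helper name here are illustrative stand-ins, not the kernel definitions:

#include <stdio.h>

#define DEFAULT_RQ 128u	/* stands in for the 128 named in the comment */

static unsigned int default_nr_requests(unsigned int hw_queue_depth)
{
	unsigned int smaller = hw_queue_depth < DEFAULT_RQ ?
			       hw_queue_depth : DEFAULT_RQ;

	return 2 * smaller;
}

int main(void)
{
	/* shallow queue: 2 * 64 = 128; deep queue: capped at 2 * 128 = 256 */
	printf("%u %u\n", default_nr_requests(64), default_nr_requests(1024));
	return 0;
}
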
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d6438e6c276d..54cffaae4df4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,6 +56,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
+ lim->atomic_write_hw_max = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -223,6 +224,27 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
}
+/*
+ * Test whether the atomic write boundary is aligned with the chunk size.
+ * Stacked devices store the stripe size in t->chunk_sectors.
+ */
+static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors,
+ unsigned int boundary_sectors)
+{
+ if (!chunk_sectors || !boundary_sectors)
+ return true;
+
+ if (boundary_sectors > chunk_sectors &&
+ boundary_sectors % chunk_sectors)
+ return false;
+
+ if (chunk_sectors > boundary_sectors &&
+ chunk_sectors % boundary_sectors)
+ return false;
+
+ return true;
+}
+
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
@@ -232,6 +254,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
+ /* UINT_MAX indicates stacked limits in initial state */
+ if (lim->atomic_write_hw_max == UINT_MAX)
+ goto unsupported;
+
if (!lim->atomic_write_hw_max)
goto unsupported;
@@ -259,20 +285,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
if (WARN_ON_ONCE(lim->atomic_write_hw_max >
lim->atomic_write_hw_boundary))
goto unsupported;
- /*
- * A feature of boundary support is that it disallows bios to
- * be merged which would result in a merged request which
- * crosses either a chunk sector or atomic write HW boundary,
- * even though chunk sectors may be just set for performance.
- * For simplicity, disallow atomic writes for a chunk sector
- * which is non-zero and smaller than atomic write HW boundary.
- * Furthermore, chunk sectors must be a multiple of atomic
- * write HW boundary. Otherwise boundary support becomes
- * complicated.
- * Devices which do not conform to these rules can be dealt
- * with if and when they show up.
- */
- if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors))
+
+ if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary(
+ lim->chunk_sectors, boundary_sectors)))
goto unsupported;
/*
@@ -639,25 +654,6 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t,
return true;
}
-/* Check for valid boundary of first bottom device */
-static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
- struct queue_limits *b)
-{
- /*
- * Ensure atomic write boundary is aligned with chunk sectors. Stacked
- * devices store chunk sectors in t->io_min.
- */
- if (b->atomic_write_hw_boundary > t->io_min &&
- b->atomic_write_hw_boundary % t->io_min)
- return false;
- if (t->io_min > b->atomic_write_hw_boundary &&
- t->io_min % b->atomic_write_hw_boundary)
- return false;
-
- t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
- return true;
-}
-
static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
{
unsigned int chunk_bytes;
@@ -695,13 +691,14 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
static bool blk_stack_atomic_writes_head(struct queue_limits *t,
struct queue_limits *b)
{
- if (b->atomic_write_hw_boundary &&
- !blk_stack_atomic_writes_boundary_head(t, b))
+ if (!blk_valid_atomic_writes_boundary(t->chunk_sectors,
+ b->atomic_write_hw_boundary >> SECTOR_SHIFT))
return false;
t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
t->atomic_write_hw_max = b->atomic_write_hw_max;
+ t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
return true;
}
@@ -717,18 +714,14 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t,
if (!blk_atomic_write_start_sect_aligned(start, b))
goto unsupported;
- /*
- * If atomic_write_hw_max is set, we have already stacked 1x bottom
- * device, so check for compliance.
- */
- if (t->atomic_write_hw_max) {
+ /* UINT_MAX indicates no stacking of bottom devices yet */
+ if (t->atomic_write_hw_max == UINT_MAX) {
+ if (!blk_stack_atomic_writes_head(t, b))
+ goto unsupported;
+ } else {
if (!blk_stack_atomic_writes_tail(t, b))
goto unsupported;
- return;
}
-
- if (!blk_stack_atomic_writes_head(t, b))
- goto unsupported;
blk_stack_atomic_writes_chunk_sectors(t);
return;
@@ -763,7 +756,8 @@ unsupported:
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t start)
{
- unsigned int top, bottom, alignment, ret = 0;
+ unsigned int top, bottom, alignment;
+ int ret = 0;
t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
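
The new blk_valid_atomic_writes_boundary() helper above accepts a boundary/chunk pair whenever either value is zero or the larger value is a whole multiple of the smaller one. A small user-space sketch of that rule; the function name and test values are illustrative:

#include <stdbool.h>
#include <stdio.h>

static bool boundary_aligned(unsigned int chunk_sectors,
			     unsigned int boundary_sectors)
{
	/* a zero value on either side means "no constraint" */
	if (!chunk_sectors || !boundary_sectors)
		return true;
	if (boundary_sectors > chunk_sectors && boundary_sectors % chunk_sectors)
		return false;
	if (chunk_sectors > boundary_sectors && chunk_sectors % boundary_sectors)
		return false;
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       boundary_aligned(8, 32),		/* 1: 32 is a multiple of 8 */
	       boundary_aligned(32, 8),		/* 1: 32 is a multiple of 8 */
	       boundary_aligned(12, 8));	/* 0: neither divides the other */
	return 0;
}
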
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c94b8b6ab024..76c47fe9b8d6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -64,28 +64,66 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
static ssize_t
queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
- unsigned long nr;
- int ret, err;
- unsigned int memflags;
struct request_queue *q = disk->queue;
-
- if (!queue_is_mq(q))
- return -EINVAL;
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *et = NULL;
+ unsigned int memflags;
+ unsigned long nr;
+ int ret;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
- memflags = blk_mq_freeze_queue(q);
- mutex_lock(&q->elevator_lock);
+ /*
+ * Serialize nr_requests updates against concurrent queue_requests_store()
+ * calls and elevator switching.
+ */
+ down_write(&set->update_nr_hwq_lock);
+
+ if (nr == q->nr_requests)
+ goto unlock;
+
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
- err = blk_mq_update_nr_requests(disk->queue, nr);
- if (err)
- ret = err;
+ /*
+ * Switching elevator is protected by update_nr_hwq_lock:
+ * - the read lock is held while handling the elevator sysfs attribute;
+ * - the write lock is held while updating nr_hw_queues;
+ * hence it's safe to access q->elevator here with the write lock held.
+ */
+ if (nr <= set->reserved_tags ||
+ (q->elevator && nr > MAX_SCHED_RQ) ||
+ (!q->elevator && nr > set->queue_depth)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!blk_mq_is_shared_tags(set->flags) && q->elevator &&
+ nr > q->elevator->et->nr_requests) {
+ /*
+ * Tags will grow; allocate memory before freezing the queue to
+ * prevent a deadlock.
+ */
+ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
+ if (!et) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ }
+
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
+ et = blk_mq_update_nr_requests(q, et, nr);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+
+unlock:
+ up_write(&set->update_nr_hwq_lock);
return ret;
}
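
The rewritten queue_requests_store() allocates the potentially larger scheduler tags before freezing the queue, swaps them in while frozen, and frees whichever tag set is left over only afterwards, so that a sleeping allocation never has to wait for forward progress from the already-frozen queue (the deadlock the comment above refers to). A loose user-space analogue of that ordering, allocating outside the critical section and freeing after it; all names here are illustrative:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int *table;
static size_t table_len;

static int resize_table(size_t new_len)
{
	unsigned int *fresh = NULL, *stale = NULL;

	/* allocate before taking the lock: this call may block */
	if (new_len > table_len) {
		fresh = calloc(new_len, sizeof(*fresh));
		if (!fresh)
			return -1;
	}

	pthread_mutex_lock(&lock);
	if (fresh) {
		if (table_len)
			memcpy(fresh, table, table_len * sizeof(*table));
		stale = table;
		table = fresh;
	}
	table_len = new_len;
	pthread_mutex_unlock(&lock);

	free(stale);	/* the displaced buffer is released after unlocking */
	return 0;
}

int main(void)
{
	resize_table(4);
	resize_table(16);
	free(table);
	return 0;
}
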
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 397b6a410f9e..2c5b64b1a724 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bio_list_on_stack)))
- submit_bio_noacct_nocheck(bio);
+ submit_bio_noacct_nocheck(bio, false);
blk_finish_plug(&plug);
}
}
@@ -1327,17 +1327,13 @@ static int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- /*
- * Freeze queue before activating policy, to synchronize with IO path,
- * which is protected by 'q_usage_counter'.
- */
memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
q->td = td;
td->queue = q;
- /* activate policy */
+ /* activate policy, blk_throtl_activated() will return true */
ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
if (ret) {
q->td = NULL;
@@ -1846,12 +1842,15 @@ void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- if (!blk_throtl_activated(q))
+ /*
+ * blkg_destroy_all() already deactivates the throtl policy; just check
+ * and free the throtl data.
+ */
+ if (!q->td)
return;
timer_delete_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
- blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
kfree(q->td);
}
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 3b27755bfbff..9d7a42c039a1 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -156,7 +156,13 @@ void blk_throtl_cancel_bios(struct gendisk *disk);
static inline bool blk_throtl_activated(struct request_queue *q)
{
- return q->td != NULL;
+ /*
+ * q->td guarantees that the blk-throttle module is already loaded,
+ * and the plid of blk-throttle is assigned.
+ * blkcg_policy_enabled() guarantees that the policy is activated
+ * in the request_queue.
+ */
+ return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl);
}
static inline bool blk_should_throtl(struct bio *bio)
@@ -164,11 +170,6 @@ static inline bool blk_should_throtl(struct bio *bio)
struct throtl_grp *tg;
int rw = bio_data_dir(bio);
- /*
- * This is called under bio_queue_enter(), and it's synchronized with
- * the activation of blk-throtl, which is protected by
- * blk_mq_freeze_queue().
- */
if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
return false;
@@ -194,7 +195,10 @@ static inline bool blk_should_throtl(struct bio *bio)
static inline bool blk_throtl_bio(struct bio *bio)
{
-
+ /*
+ * block throttling takes effect if the policy is activated
+ * in the bio's request_queue.
+ */
if (!blk_should_throtl(bio))
return false;
diff --git a/block/blk.h b/block/blk.h
index 46f566f9b126..170794632135 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -41,6 +41,7 @@ struct blk_flush_queue {
struct list_head flush_queue[2];
unsigned long flush_data_in_flight;
struct request *flush_rq;
+ struct rcu_head rcu_head;
};
bool is_flush_rq(struct request *req);
@@ -54,7 +55,7 @@ bool blk_queue_start_drain(struct request_queue *q);
bool __blk_freeze_queue_start(struct request_queue *q,
struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
-void submit_bio_noacct_nocheck(struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio, bool split);
void bio_await_chain(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
@@ -615,6 +616,7 @@ extern const struct address_space_operations def_blk_aops;
int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
+int should_fail_bio(struct bio *bio);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
@@ -680,48 +682,6 @@ static inline ktime_t blk_time_get(void)
return ns_to_ktime(blk_time_get_ns());
}
-/*
- * From most significant bit:
- * 1 bit: reserved for other usage, see below
- * 12 bits: original size of bio
- * 51 bits: issue time of bio
- */
-#define BIO_ISSUE_RES_BITS 1
-#define BIO_ISSUE_SIZE_BITS 12
-#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
-#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
-#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
-#define BIO_ISSUE_SIZE_MASK \
- (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
-#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
-
-/* Reserved bit for blk-throtl */
-#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
-
-static inline u64 __bio_issue_time(u64 time)
-{
- return time & BIO_ISSUE_TIME_MASK;
-}
-
-static inline u64 bio_issue_time(struct bio_issue *issue)
-{
- return __bio_issue_time(issue->value);
-}
-
-static inline sector_t bio_issue_size(struct bio_issue *issue)
-{
- return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
-}
-
-static inline void bio_issue_init(struct bio_issue *issue,
- sector_t size)
-{
- size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
- issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
- (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) |
- ((u64)size << BIO_ISSUE_SIZE_SHIFT));
-}
-
void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file);
diff --git a/block/elevator.c b/block/elevator.c
index fe96c6f4753c..e2ebfbf107b3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -669,7 +669,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
lockdep_assert_held(&set->update_nr_hwq_lock);
if (strncmp(ctx->name, "none", 4)) {
- ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+ ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues,
+ blk_mq_default_nr_requests(set));
if (!ctx->et)
return -ENOMEM;
}
diff --git a/block/elevator.h b/block/elevator.h
index adc5c157e17e..c4d20155065e 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -37,7 +37,7 @@ struct elevator_mq_ops {
void (*exit_sched)(struct elevator_queue *);
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
- void (*depth_updated)(struct blk_mq_hw_ctx *);
+ void (*depth_updated)(struct request_queue *);
bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
diff --git a/block/fops.c b/block/fops.c
index 82451ac8ff25..19814bddd77e 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -38,8 +38,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
struct iov_iter *iter)
{
- return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
- !bdev_iter_is_aligned(bdev, iter);
+ return (iocb->ki_pos | iov_iter_count(iter)) &
+ (bdev_logical_block_size(bdev) - 1);
}
#define DIO_INLINE_BIO_VECS 4
@@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
- ret = bio_iov_iter_get_pages(&bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
*/
bio_iov_bvec_set(bio, iter);
} else {
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
if (unlikely(ret))
goto out_bio_put;
}
diff --git a/block/ioctl.c b/block/ioctl.c
index f7b0006ca45d..a14304f737c5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev,
*/
memset(&geo, 0, sizeof(geo));
geo.start = get_start_sect(bdev);
- ret = disk->fops->getgeo(bdev, &geo);
+ ret = disk->fops->getgeo(disk, &geo);
if (ret)
return ret;
if (copy_to_user(argp, &geo, sizeof(geo)))
@@ -515,7 +515,7 @@ static int compat_hdio_getgeo(struct block_device *bdev,
* want to override it.
*/
geo.start = get_start_sect(bdev);
- ret = disk->fops->getgeo(bdev, &geo);
+ ret = disk->fops->getgeo(disk, &geo);
if (ret)
return ret;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 70cbc7b2deb4..18efd6ef2a2b 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -399,6 +399,14 @@ err:
return ERR_PTR(ret);
}
+static void kyber_depth_updated(struct request_queue *q)
+{
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+
+ kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U;
+ blk_mq_set_min_shallow_depth(q, kqd->async_depth);
+}
+
static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct kyber_queue_data *kqd;
@@ -413,6 +421,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
eq->elevator_data = kqd;
q->elevator = eq;
+ kyber_depth_updated(q);
return 0;
}
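
kyber_depth_updated() now derives the async depth from the queue-wide nr_requests rather than a per-hctx value, and the math is a plain integer percentage. A trivial user-space illustration; the 75 below is only an example figure, not necessarily the driver's constant:

#include <stdio.h>

static unsigned int async_depth(unsigned int nr_requests, unsigned int percent)
{
	return nr_requests * percent / 100u;	/* integer math, truncates */
}

int main(void)
{
	printf("%u\n", async_depth(256, 75));	/* 192 */
	return 0;
}
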
@@ -440,15 +449,6 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
INIT_LIST_HEAD(&kcq->rq_list[i]);
}
-static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
-{
- struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
-
- kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
-}
-
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct kyber_hctx_data *khd;
@@ -493,7 +493,6 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
- kyber_depth_updated(hctx);
return 0;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b9b7cdf1d3c9..3e741d33142d 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -136,10 +136,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
struct request *rq, *res = NULL;
- if (!node)
- return NULL;
-
- rq = rb_entry_rq(node);
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -507,22 +503,12 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
}
/* Called by blk_mq_update_nr_requests(). */
-static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+static void dd_depth_updated(struct request_queue *q)
{
- struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
dd->async_depth = q->nr_requests;
-
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
-}
-
-/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
-static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- dd_depth_updated(hctx);
- return 0;
+ blk_mq_set_min_shallow_depth(q, 1);
}
static void dd_exit_sched(struct elevator_queue *e)
@@ -587,6 +573,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = eq;
+ dd_depth_updated(q);
return 0;
}
@@ -1048,7 +1035,6 @@ static struct elevator_type mq_deadline = {
.has_work = dd_has_work,
.init_sched = dd_init_sched,
.exit_sched = dd_exit_sched,
- .init_hctx = dd_init_hctx,
},
#ifdef CONFIG_BLK_DEBUG_FS
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 82d9c4c3fb41..631291fbb356 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state)
goto out_nolab;
/* set start if not filled by getgeo function e.g. virtblk */
geo->start = get_start_sect(bdev);
- if (disk->fops->getgeo(bdev, geo))
+ if (disk->fops->getgeo(disk, geo))
goto out_freeall;
if (!fn || fn(disk, info)) {
kfree(info);
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 2ded5e476d6e..b43a3196e2be 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups);
/**
* ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd.
* @sdev: SCSI device for which BIOS geometry is to be determined
- * @bdev: block device associated with @sdev
+ * @unused: gendisk associated with @sdev
* @capacity: capacity of SCSI device
* @geom: location to which geometry will be output
*
@@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups);
* RETURNS:
* Zero.
*/
-int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
geom[0] = 255;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 6357d86eafdc..2932b6653b6f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = MINOR(bdev->bd_dev) & 3;
+ struct amiga_floppy_struct *p = disk->private_data;
- geo->heads = unit[drive].type->heads;
- geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
- geo->cylinders = unit[drive].type->tracks;
+ geo->heads = p->type->heads;
+ geo->sectors = p->dtype->sects * p->type->sect_mult;
+ geo->cylinders = p->type->tracks;
return 0;
}
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 00b74a845328..34ead75e7e02 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx,
}
static int
-aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct aoedev *d = bdev->bd_disk->private_data;
+ struct aoedev *d = disk->private_data;
if ((d->flags & DEVFL_UP) == 0) {
printk(KERN_ERR "aoe: disk not up\n");
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index cdf6e4041bb9..3b21750038ee 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -44,7 +44,7 @@ aoe_init(void)
{
int ret;
- aoe_wq = alloc_workqueue("aoe_wq", 0, 0);
+ aoe_wq = alloc_workqueue("aoe_wq", WQ_PERCPU, 0);
if (!aoe_wq)
return -ENOMEM;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 61d62019a38d..5336c3c5ca36 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3090,16 +3090,13 @@ static int raw_cmd_copyin(int cmd, void __user *param,
*rcmd = NULL;
loop:
- ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL);
- if (!ptr)
- return -ENOMEM;
+ ptr = memdup_user(param, sizeof(*ptr));
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
*rcmd = ptr;
- ret = copy_from_user(ptr, param, sizeof(*ptr));
ptr->next = NULL;
ptr->buffer_length = 0;
ptr->kernel_data = NULL;
- if (ret)
- return -EFAULT;
param += sizeof(struct floppy_raw_cmd);
if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE)
return -EINVAL;
@@ -3361,9 +3358,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
return 0;
}
-static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- int drive = (long)bdev->bd_disk->private_data;
+ int drive = (long)disk->private_data;
int type = ITYPE(drive_state[drive].fd_device);
struct floppy_struct *g;
int ret;
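
The floppy change above folds the kmalloc() + copy_from_user() pair into memdup_user(), which either returns a kernel copy of the user buffer or an ERR_PTR() encoding the failure. A minimal hedged sketch of that calling convention; the struct and function names are illustrative:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

struct raw_cmd {
	int cmd_count;
	/* ... */
};

static int fetch_cmd(void __user *param, struct raw_cmd **out)
{
	struct raw_cmd *cmd = memdup_user(param, sizeof(*cmd));

	if (IS_ERR(cmd))
		return PTR_ERR(cmd);	/* -EFAULT or -ENOMEM, never NULL */

	*out = cmd;
	return 0;
}
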
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8fc7761397bd..567192e371a8 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev,
* that each partition is also 4KB aligned. Non-aligned partitions adversely
* affects performance.
*
- * @dev Pointer to the block_device strucutre.
+ * @disk Pointer to the gendisk structure.
* @geo Pointer to a hd_geometry structure.
*
* return value
* 0 Operation completed successfully.
* -ENOTTY An error occurred while reading the drive capacity.
*/
-static int mtip_block_getgeo(struct block_device *dev,
+static int mtip_block_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct driver_data *dd = dev->bd_disk->private_data;
+ struct driver_data *dd = disk->private_data;
sector_t capacity;
if (!dd)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6463d0e8d0ce..1188f32a5e5e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -311,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
@@ -1217,6 +1217,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
if (!sock)
return NULL;
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
if (sock->ops->shutdown == sock_no_shutdown) {
dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
*err = -EINVAL;
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 91642c9a3b29..f982027e8c85 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -223,7 +223,7 @@ MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed nu
static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
-MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
static bool g_fua = true;
module_param_named(fua, g_fua, bool, 0444);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index faafd7ff43d6..af0e21149dbc 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -7389,7 +7389,7 @@ static int __init rbd_init(void)
* The number of active work items is limited by the number of
* rbd devices * queue depth, so leave @max_active at default.
*/
- rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!rbd_wq) {
rc = -ENOMEM;
goto err_out_slab;
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 15627417f12e..f1409e54010a 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen)
rnbd_clt_put_dev(dev);
}
-static int rnbd_client_getgeo(struct block_device *block_device,
+static int rnbd_client_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
u64 size;
- struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+ struct rnbd_clt_dev *dev = disk->private_data;
struct queue_limits *limit = &dev->queue->limits;
size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
@@ -1809,7 +1809,7 @@ static int __init rnbd_client_init(void)
unregister_blkdev(rnbd_client_major, "rnbd");
return err;
}
- rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0);
+ rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0);
if (!rnbd_clt_wq) {
pr_err("Failed to load module, alloc_workqueue failed.\n");
rnbd_clt_destroy_sysfs_files();
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 7af21fe67671..db1fe9772a4d 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
return vio_dring_avail(dr, VDC_TX_RING_SIZE);
}
-static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct gendisk *disk = bdev->bd_disk;
sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
@@ -1189,7 +1188,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
}
if (port->ldc_timeout)
- mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
+ mod_delayed_work(system_percpu_wq, &port->ldc_reset_timer_work,
round_jiffies(jiffies + HZ * port->ldc_timeout));
mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
return;
@@ -1217,7 +1216,7 @@ static int __init vdc_init(void)
{
int err;
- sunvdc_wq = alloc_workqueue("sunvdc", 0, 0);
+ sunvdc_wq = alloc_workqueue("sunvdc", WQ_PERCPU, 0);
if (!sunvdc_wq)
return -ENOMEM;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index eda33c5eb5e2..416015947ae6 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
-static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct floppy_state *fs = disk->private_data;
struct floppy_struct *g;
int ret;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 99abd67b708b..5ab7ff5f03f4 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -201,7 +201,6 @@ struct ublk_queue {
bool force_abort;
bool canceling;
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
- unsigned short nr_io_ready; /* how many ios setup */
spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
@@ -234,7 +233,7 @@ struct ublk_device {
struct ublk_params params;
struct completion completion;
- unsigned int nr_queues_ready;
+ u32 nr_io_ready;
bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
@@ -251,8 +250,7 @@ static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io,
- size_t offset);
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *
@@ -531,7 +529,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
-static inline void __ublk_complete_rq(struct request *req);
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -663,22 +662,44 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
}
+static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
}
+static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
!ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
+{
+ return !ublk_dev_support_user_copy(ub) &&
+ !ublk_dev_support_zero_copy(ub) &&
+ !ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
/*
@@ -696,6 +717,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
ublk_support_auto_buf_reg(ubq);
}
+static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
+{
+ return ublk_dev_support_user_copy(ub) ||
+ ublk_dev_support_zero_copy(ub) ||
+ ublk_dev_support_auto_buf_reg(ub);
+}
+
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
struct ublk_io *io)
{
@@ -710,8 +738,11 @@ static inline bool ublk_get_req_ref(struct ublk_io *io)
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
{
- if (refcount_dec_and_test(&io->ref))
- __ublk_complete_rq(req);
+ if (!refcount_dec_and_test(&io->ref))
+ return;
+
+ /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
+ __ublk_complete_rq(req, io, false);
}
static inline bool ublk_sub_req_ref(struct ublk_io *io)
@@ -727,6 +758,11 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
+static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
+{
+ return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
+}
+
/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
@@ -763,11 +799,9 @@ static inline int __ublk_queue_cmd_buf_size(int depth)
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
}
-static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
+static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
-
- return __ublk_queue_cmd_buf_size(ubq->q_depth);
+ return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
}
static int ublk_max_cmd_buf_size(void)
@@ -1018,13 +1052,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
return rq_bytes;
}
-static int ublk_unmap_io(const struct ublk_queue *ubq,
+static int ublk_unmap_io(bool need_map,
const struct request *req,
const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (!ublk_need_map_io(ubq))
+ if (!need_map)
return rq_bytes;
if (ublk_need_unmap_req(req)) {
@@ -1116,10 +1150,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
}
/* todo: handle partial completion */
-static inline void __ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
+ bool need_map)
{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
@@ -1143,7 +1176,7 @@ static inline void __ublk_complete_rq(struct request *req)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
- unmapped_bytes = ublk_unmap_io(ubq, req, io);
+ unmapped_bytes = ublk_unmap_io(need_map, req, io);
/*
* Extremely impossible since we got data filled in just before
@@ -1499,9 +1532,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
- /* All old ioucmds have to be completed */
- ubq->nr_io_ready = 0;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1550,7 +1580,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
- ub->nr_queues_ready = 0;
+ ub->nr_io_ready = 0;
ub->unprivileged_daemons = false;
ub->ublksrv_tgid = -1;
}
@@ -1707,23 +1737,23 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
__func__, q_id, current->pid, vma->vm_start,
phys_off, (unsigned long)sz);
- if (sz != ublk_queue_cmd_buf_size(ub, q_id))
+ if (sz != ublk_queue_cmd_buf_size(ub))
return -EINVAL;
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
struct request *req)
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+ if (ublk_nosrv_should_reissue_outstanding(ub))
blk_mq_requeue_request(req, false);
else {
io->res = -EIO;
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
}
}
@@ -1743,7 +1773,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
struct ublk_io *io = &ubq->ios[i];
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
- __ublk_fail_req(ubq, io, io->req);
+ __ublk_fail_req(ub, io, io->req);
}
}
@@ -1848,9 +1878,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+static inline bool ublk_dev_ready(const struct ublk_device *ub)
{
- return ubq->nr_io_ready == ubq->q_depth;
+ u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth;
+
+ return ub->nr_io_ready == total;
}
static void ublk_cancel_queue(struct ublk_queue *ubq)
@@ -1974,16 +2006,14 @@ static void ublk_reset_io_flags(struct ublk_device *ub)
}
/* device can only be started after all IOs are ready */
-static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_mark_io_ready(struct ublk_device *ub)
__must_hold(&ub->mutex)
{
- ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq))
- ub->nr_queues_ready++;
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
ub->unprivileged_daemons = true;
- if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
+ ub->nr_io_ready++;
+ if (ublk_dev_ready(ub)) {
/* now we are ready for handling ublk io request */
ublk_reset_io_flags(ub);
complete_all(&ub->completion);
@@ -2054,11 +2084,11 @@ ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
}
static inline int
-ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io,
+ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
struct io_uring_cmd *cmd, unsigned long buf_addr,
u16 *buf_idx)
{
- if (ublk_support_auto_buf_reg(ubq))
+ if (ublk_dev_support_auto_buf_reg(ub))
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
io->addr = buf_addr;
@@ -2097,18 +2127,18 @@ static void ublk_io_release(void *priv)
}
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag,
struct ublk_io *io,
unsigned int index, unsigned int issue_flags)
{
- struct ublk_device *ub = cmd->file->private_data;
struct request *req;
int ret;
- if (!ublk_support_zero_copy(ubq))
+ if (!ublk_dev_support_zero_copy(ub))
return -EINVAL;
- req = __ublk_check_and_get_req(ub, ubq, io, 0);
+ req = __ublk_check_and_get_req(ub, q_id, tag, io, 0);
if (!req)
return -EINVAL;
@@ -2124,7 +2154,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
static int
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, struct ublk_io *io,
+ struct ublk_device *ub,
+ u16 q_id, u16 tag, struct ublk_io *io,
unsigned index, unsigned issue_flags)
{
unsigned new_registered_buffers;
@@ -2137,9 +2168,10 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
*/
new_registered_buffers = io->task_registered_buffers + 1;
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
- return ublk_register_io_buf(cmd, ubq, io, index, issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
+ issue_flags);
- if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req))
+ if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
return -EINVAL;
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
@@ -2161,14 +2193,14 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
-static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
+static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
{
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* FETCH_RQ has to provide IO buffer if NEED GET
* DATA is not enabled
*/
- if (!buf_addr && !ublk_need_get_data(ubq))
+ if (!buf_addr && !ublk_dev_need_get_data(ub))
return -EINVAL;
} else if (buf_addr) {
/* User copy requires addr to be unset */
@@ -2177,10 +2209,9 @@ static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr)
return 0;
}
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
- struct ublk_device *ub = ubq->dev;
int ret = 0;
/*
@@ -2189,8 +2220,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
*/
mutex_lock(&ub->mutex);
- /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
- if (ublk_queue_ready(ubq)) {
+ /* UBLK_IO_FETCH_REQ is only allowed before the dev is set up */
+ if (ublk_dev_ready(ub)) {
ret = -EBUSY;
goto out;
}
@@ -2204,28 +2235,28 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
if (ret)
goto out;
WRITE_ONCE(io->task, get_task_struct(current));
- ublk_mark_io_ready(ub, ubq);
+ ublk_mark_io_ready(ub);
out:
mutex_unlock(&ub->mutex);
return ret;
}
-static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
+static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
struct ublk_io *io, __u64 buf_addr)
{
struct request *req = io->req;
- if (ublk_need_map_io(ubq)) {
+ if (ublk_dev_need_map_io(ub)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
- if (!buf_addr && (!ublk_need_get_data(ubq) ||
+ if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
req_op(req) == REQ_OP_READ))
return -EINVAL;
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
@@ -2239,10 +2270,10 @@ static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq,
return 0;
}
-static bool ublk_need_complete_req(const struct ublk_queue *ubq,
+static bool ublk_need_complete_req(const struct ublk_device *ub,
struct ublk_io *io)
{
- if (ublk_need_req_ref(ubq))
+ if (ublk_dev_need_req_ref(ub))
return ublk_sub_req_ref(io);
return true;
}
@@ -2265,23 +2296,28 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
return ublk_start_io(ubq, req, io);
}
-static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
- unsigned int issue_flags,
- const struct ublksrv_io_cmd *ub_cmd)
+static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
+ /* May point to userspace-mapped memory */
+ const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
struct ublk_io *io;
u32 cmd_op = cmd->cmd_op;
- unsigned tag = ub_cmd->tag;
+ u16 q_id = READ_ONCE(ub_src->q_id);
+ u16 tag = READ_ONCE(ub_src->tag);
+ s32 result = READ_ONCE(ub_src->result);
+ u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
struct request *req;
int ret;
bool compl;
+ WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
+
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
- __func__, cmd->cmd_op, ub_cmd->q_id, tag,
- ub_cmd->result);
+ __func__, cmd->cmd_op, q_id, tag, result);
ret = ublk_check_cmd_op(cmd_op);
if (ret)
@@ -2292,25 +2328,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
- return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
- issue_flags);
+ return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
ret = -EINVAL;
- if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+ if (q_id >= ub->dev_info.nr_hw_queues)
goto out;
- ubq = ublk_get_queue(ub, ub_cmd->q_id);
+ ubq = ublk_get_queue(ub, q_id);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
goto out;
io = &ubq->ios[tag];
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
- ret = ublk_check_fetch_buf(ubq, ub_cmd->addr);
+ ret = ublk_check_fetch_buf(ub, addr);
if (ret)
goto out;
- ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ ret = ublk_fetch(cmd, ub, io, addr);
if (ret)
goto out;
@@ -2324,8 +2359,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* so can be handled on any task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
- return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
- issue_flags);
+ return ublk_register_io_buf(cmd, ub, q_id, tag, io,
+ addr, issue_flags);
goto out;
}
@@ -2346,24 +2381,24 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_REGISTER_IO_BUF:
- return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+ return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
issue_flags);
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr);
+ ret = ublk_check_commit_and_fetch(ub, io, addr);
if (ret)
goto out;
- io->res = ub_cmd->result;
+ io->res = result;
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx);
- compl = ublk_need_complete_req(ubq, io);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
+ compl = ublk_need_complete_req(ub, io);
/* can't touch 'ublk_io' any more */
if (buf_idx != UBLK_INVALID_BUF_IDX)
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
+ req->__sector = addr;
if (compl)
- __ublk_complete_rq(req);
+ __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
if (ret)
goto out;
@@ -2375,7 +2410,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
* request
*/
req = ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL);
+ ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
WARN_ON_ONCE(ret);
if (likely(ublk_get_data(ubq, io, req))) {
__ublk_prep_compl_io_cmd(io, req);
@@ -2395,16 +2430,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
}
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- const struct ublk_queue *ubq, struct ublk_io *io, size_t offset)
+ u16 q_id, u16 tag, struct ublk_io *io, size_t offset)
{
- unsigned tag = io - ubq->ios;
struct request *req;
/*
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
* which would overwrite it with io->cmd
*/
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
if (!req)
return NULL;
@@ -2426,26 +2460,6 @@ fail_put:
return NULL;
}
-static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- /*
- * Not necessary for async retry, but let's keep it simple and always
- * copy the values to avoid any potential reuse.
- */
- const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
- const struct ublksrv_io_cmd ub_cmd = {
- .q_id = READ_ONCE(ub_src->q_id),
- .tag = READ_ONCE(ub_src->tag),
- .result = READ_ONCE(ub_src->result),
- .addr = READ_ONCE(ub_src->addr)
- };
-
- WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
-
- return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
-}
-
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -2515,17 +2529,14 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
return ERR_PTR(-EINVAL);
ubq = ublk_get_queue(ub, q_id);
- if (!ubq)
- return ERR_PTR(-EINVAL);
-
- if (!ublk_support_user_copy(ubq))
+ if (!ublk_dev_support_user_copy(ub))
return ERR_PTR(-EACCES);
- if (tag >= ubq->q_depth)
+ if (tag >= ub->dev_info.queue_depth)
return ERR_PTR(-EINVAL);
*io = &ubq->ios[tag];
- req = __ublk_check_and_get_req(ub, ubq, *io, buf_off);
+ req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off);
if (!req)
return ERR_PTR(-EINVAL);
@@ -2588,7 +2599,7 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub, q_id);
+ int size = ublk_queue_cmd_buf_size(ub);
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
int i;
@@ -2615,7 +2626,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
ubq->q_depth = ub->dev_info.queue_depth;
- size = ublk_queue_cmd_buf_size(ub, q_id);
+ size = ublk_queue_cmd_buf_size(ub);
ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
if (!ptr)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index e649fa67bac1..f061420dfb10 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -829,9 +829,9 @@ out:
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
-static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct virtio_blk *vblk = bd->bd_disk->private_data;
+ struct virtio_blk *vblk = disk->private_data;
int ret = 0;
mutex_lock(&vblk->vdev_mutex);
@@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
/* some standard values, similar to sd */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
}
out:
mutex_unlock(&vblk->vdev_mutex);
@@ -1682,7 +1682,7 @@ static int __init virtio_blk_init(void)
{
int error;
- virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+ virtblk_wq = alloc_workqueue("virtio-blk", WQ_PERCPU, 0);
if (!virtblk_wq)
return -ENOMEM;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5babe575c288..04fc6b552c04 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg)
schedule_work(&rinfo->work);
}
-static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg)
{
/* We don't have real geometry info, but let's at least return
values consistent with the size of the device */
- sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t nsect = get_capacity(disk);
sector_t cylinders = nsect;
hg->heads = 0xff;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8acad3cc6e6e..78184d3c526a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1085,7 +1085,7 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page,
work.entry = entry;
INIT_WORK_ONSTACK(&work.work, zram_sync_read);
- queue_work(system_unbound_wq, &work.work);
+ queue_work(system_dfl_wq, &work.work);
flush_work(&work.work);
destroy_work_on_stack(&work.work);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ddb37f6670de..07c19b2182ca 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -37,6 +37,32 @@ config BLK_DEV_MD
If unsure, say N.
+config MD_BITMAP
+ bool "MD RAID bitmap support"
+ default y
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the write intent bitmap will be
+ enabled. The bitmap can be used to optimize resync speed after a power
+ failure or after re-adding a disk, by limiting the resync to the dirty
+ sectors recorded in the bitmap.
+
+ The bitmap can be added to an existing MD array, or the array can be
+ created with a bitmap from the start, using mdadm(8).
+
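+ For example, an internal bitmap can typically be added to an existing
+ array with:
+
+   mdadm --grow --bitmap=internal /dev/mdX
+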
+ If unsure, say Y.
+
+config MD_LLBITMAP
+ bool "MD RAID lockless bitmap support"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, support for the lockless write intent bitmap will
+ be enabled.
+
+ Note, this is an experimental feature.
+
+ If unsure, say N.
+
config MD_AUTODETECT
bool "Autodetect RAID arrays during kernel boot"
depends on BLK_DEV_MD=y
@@ -54,6 +80,7 @@ config MD_AUTODETECT
config MD_BITMAP_FILE
bool "MD bitmap file support (deprecated)"
default y
+ depends on MD_BITMAP
help
If you say Y here, support for write intent bitmaps in files on an
external file system is enabled. This is an alternative to the internal
@@ -174,6 +201,7 @@ config MD_RAID456
config MD_CLUSTER
tristate "Cluster Support for MD"
+ select MD_BITMAP
depends on BLK_DEV_MD
depends on DLM
default n
@@ -393,6 +421,7 @@ config DM_RAID
select MD_RAID1
select MD_RAID10
select MD_RAID456
+ select MD_BITMAP
select BLK_DEV_MD
help
A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 87bdfc9fe14c..5a51b3408b70 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
-md-mod-y += md.o md-bitmap.o
+md-mod-y += md.o
+md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o
+md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
linear-y += md-linear.o
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 7510d1c983a5..f327456fc4e0 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
check = bio_kmalloc(nr_segs, GFP_NOIO);
if (!check)
return;
- bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs,
- REQ_OP_READ);
+ bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ);
check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 020712c5203f..2386d08bf4e4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
- meta_bucket_pages(&c->cache->sb), 0);
+ bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0);
return bio;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7ff14bd2feb8..d50eb82ccb4f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca)
atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
- bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD);
+ bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD);
bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
ca->sb.d[ja->discard_idx]);
bio->bi_iter.bi_size = bucket_bytes(ca);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 26a6a535ec32..73918e55bf04 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c)
continue;
}
- io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1492c8552255..6d250e366412 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca)
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
- bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);
+ bio_init_inline(&ca->journal.bio, NULL, 8, 0);
/*
* When the cache disk is first registered, ca->sb.njournal_buckets
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 302e75f1fc4b..6ba73dc1a3df 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w)
struct dirty_io *io = w->private;
struct bio *bio = &io->bio;
- bio_init(bio, NULL, bio->bi_inline_vecs,
+ bio_init_inline(bio, NULL,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
if (!io->dc->writeback_percent)
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
@@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(struct_size(io, bio.bi_inline_vecs,
- DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
- GFP_KERNEL);
+ io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
if (!io)
goto err;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ff7595caf440..8f3a23f4b168 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
use_dmio(b, op, sector, n_sectors, offset, ioprio);
return;
}
- bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
+ bio_init_inline(bio, b->c->bdev, 1, op);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_complete;
bio->bi_private = b;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index cf17fd46e255..08925aca838c 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
if (!clone)
return NULL;
- bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf);
+ bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf);
clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector);
clone->bi_private = bio;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 79ea85d18e24..967d72929909 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3953,9 +3953,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
struct mddev *mddev = &rs->md;
- r = mddev->bitmap_ops->load(mddev);
- if (r)
- DMERR("Failed to load bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->load(mddev);
+ if (r)
+ DMERR("Failed to load bitmap");
+ }
}
return r;
@@ -4070,10 +4072,12 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
- r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
- chunksize, false);
- if (r)
- DMERR("Failed to resize bitmap");
+ if (md_bitmap_enabled(mddev, false)) {
+ r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
+ chunksize);
+ if (r)
+ DMERR("Failed to resize bitmap");
+ }
}
/* Check for any resize/reshape on @rs and adjust/initiate */
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
index e7f4153e55e3..8fc22fb14196 100644
--- a/drivers/md/dm-vdo/vio.c
+++ b/drivers/md/dm-vdo/vio.c
@@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t
return VDO_SUCCESS;
bio->bi_ioprio = 0;
- bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_io_vec = bio_inline_vecs(bio);
bio->bi_max_vecs = vio->block_count + 1;
if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d",
size, vio_size) != VDO_SUCCESS)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a44e8c2dccee..7bd6fa05b00a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w)
dm_deferred_remove();
}
-static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
+ struct mapped_device *md = disk->private_data;
return dm_get_geometry(md, geo);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 334b71404930..84b7e2af6dba 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -34,15 +34,6 @@
#include "md-bitmap.h"
#include "md-cluster.h"
-#define BITMAP_MAJOR_LO 3
-/* version 4 insists the bitmap is in little-endian order
- * with version 3, it is host-endian which is non-portable
- * Version 5 is currently set only for clustered devices
- */
-#define BITMAP_MAJOR_HI 4
-#define BITMAP_MAJOR_CLUSTERED 5
-#define BITMAP_MAJOR_HOSTENDIAN 3
-
/*
* in-memory bitmap:
*
@@ -224,6 +215,8 @@ struct bitmap {
int cluster_slot;
};
+static struct workqueue_struct *md_bitmap_wq;
+
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, bool init);
@@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap)
return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}
-static bool __bitmap_enabled(struct bitmap *bitmap)
-{
- return bitmap->storage.filemap &&
- !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-static bool bitmap_enabled(struct mddev *mddev)
+static bool bitmap_enabled(void *data, bool flush)
{
- struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap *bitmap = data;
- if (!bitmap)
- return false;
+ if (!flush)
+ return true;
- return __bitmap_enabled(bitmap);
+ /*
+ * If the caller wants to flush bitmap pages to the underlying disks, check
+ * if there are cached pages in the filemap.
+ */
+ return !test_bit(BITMAP_STALE, &bitmap->flags) &&
+ bitmap->storage.filemap != NULL;
}
/*
@@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
return -EINVAL;
}
- md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page);
+ md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit),
+ page, 0);
return 0;
}
@@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap)
int dirty, need_write;
int writing = 0;
- if (!__bitmap_enabled(bitmap))
+ if (!bitmap_enabled(bitmap, true))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
- bool rv;
+ bool rv = false;
- if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
- *blocks = 1024;
- return true; /* always resync if no bitmap */
- }
spin_lock_irq(&bitmap->counts.lock);
-
- rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc) {
/* locked */
@@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
bitmap_counter_t *bmc;
unsigned long flags;
- if (bitmap == NULL) {
- *blocks = 1024;
- return;
- }
spin_lock_irqsave(&bitmap->counts.lock, flags);
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc == NULL)
@@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev)
struct bitmap *bitmap = mddev->bitmap;
int bw;
- if (!bitmap)
- return;
-
atomic_inc(&bitmap->behind_writes);
bw = atomic_read(&bitmap->behind_writes);
if (bw > bitmap->behind_writes_used)
@@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
- if (!bitmap)
- return;
-
if (atomic_dec_and_test(&bitmap->behind_writes))
wake_up(&bitmap->behind_wait);
pr_debug("dec write-behind count %d/%lu\n",
@@ -2593,15 +2570,14 @@ err:
return ret;
}
-static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init)
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- return __bitmap_resize(bitmap, blocks, chunksize, init);
+ return __bitmap_resize(bitmap, blocks, chunksize, false);
}
static ssize_t
@@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = {
&max_backlog_used.attr,
NULL
};
-const struct attribute_group md_bitmap_group = {
+
+static struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
static struct bitmap_operations bitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_BITMAP,
+ .name = "bitmap",
+ },
+
.enabled = bitmap_enabled,
.create = bitmap_create,
.resize = bitmap_resize,
@@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = {
.start_write = bitmap_start_write,
.end_write = bitmap_end_write,
+ .start_discard = bitmap_start_write,
+ .end_discard = bitmap_end_write,
+
.start_sync = bitmap_start_sync,
.end_sync = bitmap_end_sync,
.cond_end_sync = bitmap_cond_end_sync,
@@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = {
.copy_from_slot = bitmap_copy_from_slot,
.set_pages = bitmap_set_pages,
.free = md_bitmap_free,
+
+ .group = &md_bitmap_group,
};
-void mddev_set_bitmap_ops(struct mddev *mddev)
+int md_bitmap_init(void)
+{
+ md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
+ 0);
+ if (!md_bitmap_wq)
+ return -ENOMEM;
+
+ return register_md_submodule(&bitmap_ops.head);
+}
+
+void md_bitmap_exit(void)
{
- mddev->bitmap_ops = &bitmap_ops;
+ destroy_workqueue(md_bitmap_wq);
+ unregister_md_submodule(&bitmap_ops.head);
}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 59e9dd45cfde..b42a28fa83a0 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -9,10 +9,26 @@
#define BITMAP_MAGIC 0x6d746962
+/*
+ * version 3 is host-endian order; this is deprecated and not used for new
+ * arrays
+ */
+#define BITMAP_MAJOR_LO 3
+#define BITMAP_MAJOR_HOSTENDIAN 3
+/* version 4 is little-endian order, the default value */
+#define BITMAP_MAJOR_HI 4
+/* version 5 is only used for cluster */
+#define BITMAP_MAJOR_CLUSTERED 5
+/* version 6 is only used for lockless bitmap */
+#define BITMAP_MAJOR_LOCKLESS 6
+
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
- BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
+ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+ BITMAP_FIRST_USE = 3, /* llbitmap is just created */
+ BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */
+ BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */
BITMAP_HOSTENDIAN =15,
};
@@ -61,11 +77,15 @@ struct md_bitmap_stats {
struct file *file;
};
+typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors);
+
struct bitmap_operations {
- bool (*enabled)(struct mddev *mddev);
+ struct md_submodule_head head;
+
+ bool (*enabled)(void *data, bool flush);
int (*create)(struct mddev *mddev);
- int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
- bool init);
+ int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize);
int (*load)(struct mddev *mddev);
void (*destroy)(struct mddev *mddev);
@@ -80,10 +100,13 @@ struct bitmap_operations {
void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
- void (*start_write)(struct mddev *mddev, sector_t offset,
- unsigned long sectors);
- void (*end_write)(struct mddev *mddev, sector_t offset,
- unsigned long sectors);
+ md_bitmap_fn *start_write;
+ md_bitmap_fn *end_write;
+ md_bitmap_fn *start_discard;
+ md_bitmap_fn *end_discard;
+
+ sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset);
+ bool (*blocks_synced)(struct mddev *mddev, sector_t offset);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
@@ -101,9 +124,75 @@ struct bitmap_operations {
sector_t *hi, bool clear_bits);
void (*set_pages)(void *data, unsigned long pages);
void (*free)(void *data);
+
+ struct attribute_group *group;
};
/* the bitmap API */
-void mddev_set_bitmap_ops(struct mddev *mddev);
+static inline bool md_bitmap_registered(struct mddev *mddev)
+{
+ return mddev->bitmap_ops != NULL;
+}
+
+static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush)
+{
+ /* bitmap_ops must be registered before creating bitmap. */
+ if (!md_bitmap_registered(mddev))
+ return false;
+
+ if (!mddev->bitmap)
+ return false;
+
+ return mddev->bitmap_ops->enabled(mddev->bitmap, flush);
+}
+
+static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
+{
+ /* always resync if no bitmap */
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return true;
+ }
+
+ return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded);
+}
+
+static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ if (!md_bitmap_enabled(mddev, false)) {
+ *blocks = 1024;
+ return;
+ }
+
+ mddev->bitmap_ops->end_sync(mddev, offset, blocks);
+}
+
+#ifdef CONFIG_MD_BITMAP
+int md_bitmap_init(void);
+void md_bitmap_exit(void);
+#else
+static inline int md_bitmap_init(void)
+{
+ return 0;
+}
+static inline void md_bitmap_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_MD_LLBITMAP
+int md_llbitmap_init(void);
+void md_llbitmap_exit(void);
+#else
+static inline int md_llbitmap_init(void)
+{
+ return 0;
+}
+static inline void md_llbitmap_exit(void)
+{
+}
+#endif
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5497eaee96e7..6a7deb7e545b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
ret = mddev->bitmap_ops->resize(mddev,
le64_to_cpu(msg->high),
- 0, false);
+ 0);
break;
default:
ret = -1;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 5d9b08115375..76f85cc32942 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -256,18 +256,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
- struct bio *split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, &mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio = bio_submit_split_bioset(bio, end_sector - bio_sector,
+ &mddev->bio_set);
+ if (!bio)
return true;
- }
-
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
}
md_account_bio(mddev, &bio);
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
new file mode 100644
index 000000000000..1eb434306162
--- /dev/null
+++ b/drivers/md/md-llbitmap.c
@@ -0,0 +1,1626 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <trace/events/block.h>
+
+#include "md.h"
+#include "md-bitmap.h"
+
+/*
+ * #### Background
+ *
+ * Redundant data is used to enhance data fault tolerance, and the storage
+ * methods for redundant data vary depending on the RAID levels. And it's
+ * important to maintain the consistency of redundant data.
+ *
+ * Bitmap is used to record which data blocks have been synchronized and which
+ * ones need to be resynchronized or recovered. Each bit in the bitmap
+ * represents a segment of data in the array. When a bit is set, it indicates
+ * that the multiple redundant copies of that data segment may not be
+ * consistent. Data synchronization can be performed based on the bitmap after
+ * power failure or readding a disk. If there is no bitmap, a full disk
+ * synchronization is required.
+ *
+ * #### Key Features
+ *
+ * - The IO fast path is lockless: if the user issues many write IOs to the
+ *   same bitmap bit in a short time, only the first write has the additional
+ *   overhead of updating the bitmap bit, and the following writes have none;
+ * - only written data needs to be resynced or recovered, which means that
+ *   when creating a new array or replacing a disk with a new one, there is
+ *   no need to do a full disk resync/recovery;
+ *
+ * #### Key Concept
+ *
+ * ##### State Machine
+ *
+ * Each bit is one byte that holds one of 6 different states, see
+ * llbitmap_state. There are 8 different actions, see llbitmap_action, that
+ * can change the state:
+ *
+ * llbitmap state machine: transitions between states
+ *
+ * | | Startwrite | Startsync | Endsync | Abortsync|
+ * | --------- | ---------- | --------- | ------- | ------- |
+ * | Unwritten | Dirty | x | x | x |
+ * | Clean | Dirty | x | x | x |
+ * | Dirty | x | x | x | x |
+ * | NeedSync | x | Syncing | x | x |
+ * | Syncing | x | Syncing | Dirty | NeedSync |
+ *
+ * | | Reload | Daemon | Discard | Stale |
+ * | --------- | -------- | ------ | --------- | --------- |
+ * | Unwritten | x | x | x | x |
+ * | Clean | x | x | Unwritten | NeedSync |
+ * | Dirty | NeedSync | Clean | Unwritten | NeedSync |
+ * | NeedSync | x | x | Unwritten | x |
+ * | Syncing | NeedSync | x | Unwritten | NeedSync |
+ *
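+ * As an illustrative sketch (variable names here are only for illustration;
+ * this is what llbitmap_state_machine() below boils down to for a single bit):
+ *
+ *   enum llbitmap_state next = state_machine[cur][action];
+ *
+ *   if (next != BitNone)
+ *       llbitmap_write(llbitmap, next, bit);
+ *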
+ * Typical scenarios:
+ *
+ * 1) Create new array
+ * All bits will be set to Unwritten by default, if --assume-clean is set,
+ * all bits will be set to Clean instead.
+ *
+ * 2) write data; raid1/raid10 have a full copy of the data, while raid456
+ * doesn't and relies on xor data
+ *
+ * 2.1) write new data to raid1/raid10:
+ * Unwritten --StartWrite--> Dirty
+ *
+ * 2.2) write new data to raid456:
+ * Unwritten --StartWrite--> NeedSync
+ *
+ * Because the initial recovery for raid456 is skipped, the xor data is not
+ * built yet; the bit must be set to NeedSync first, and after the lazy initial
+ * recovery is finished, the bit is finally set to Dirty (see 5.1 and 5.4);
+ *
+ * 2.3) cover write
+ * Clean --StartWrite--> Dirty
+ *
+ * 3) daemon, if the array is not degraded:
+ * Dirty --Daemon--> Clean
+ *
+ * 4) discard
+ * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
+ *
+ * 5) resync and recover
+ *
+ * 5.1) common process
+ * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
+ *
+ * 5.2) resync after power failure
+ * Dirty --Reload--> NeedSync
+ *
+ * 5.3) recover while replacing with a new disk
+ * By default, the old bitmap framework will recover all data; llbitmap
+ * handles this case with a new helper, see llbitmap_skip_sync_blocks:
+ *
+ * skip recovery for bits other than Dirty or Clean;
+ *
+ * 5.4) lazy initial recover for raid5:
+ * By default, the old bitmap framework only allows a new recovery when there
+ * are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER is
+ * added to perform raid456 lazy recovery for set bits (from 2.2).
+ *
+ * 6. special handling for degraded array:
+ *
+ * - Dirty bits will never be cleared and the daemon will just do nothing, so
+ *   that if a disk is re-added, Clean bits can be skipped during recovery;
+ * - Dirty bits will convert to Syncing from start sync, to do data recovery
+ *   for newly added disks;
+ * - new writes will convert bits to NeedSync directly;
+ *
+ * ##### Bitmap IO
+ *
+ * ##### Chunksize
+ *
+ * The default bitmap size is 128k, including the 1k bitmap super block, and
+ * the default size of the segment of array data covered by each bit (the
+ * chunksize) is 64k. The chunksize is doubled each time the total number of
+ * bits would otherwise be no less than 127k (see llbitmap_init).
+ *
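+ * A minimal sketch of that sizing loop (the real code is in llbitmap_init()
+ * below; 'space' is the available bitmap space in bytes, taken from
+ * bitmap_info.space):
+ *
+ *   unsigned long chunksize = MIN_CHUNK_SIZE;
+ *   unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
+ *
+ *   while (chunks > space) {
+ *       chunksize <<= 1;
+ *       chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ *   }
+ *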
+ * ##### READ
+ *
+ * While creating the bitmap, all pages will be allocated and read for
+ * llbitmap; there will be no reads afterwards.
+ *
+ * ##### WRITE
+ *
+ * WRITE IO is divided into blocks of the array's logical_block_size, and the
+ * dirty state of each block is tracked independently. For example:
+ *
+ * each page is 4k and contains 8 blocks; each block is 512 bytes and contains
+ * 512 bits;
+ *
+ * | page0 | page1 | ... | page 31 |
+ * | |
+ * | \-----------------------\
+ * | |
+ * | block0 | block1 | ... | block 8|
+ * | |
+ * | \-----------------\
+ * | |
+ * | bit0 | bit1 | ... | bit511 |
+ *
+ * From the IO path, if one bit is changed to Dirty or NeedSync, the
+ * corresponding subpage is marked dirty, and that block must be written out
+ * before the IO is issued. This behaviour affects IO performance; to reduce
+ * the impact, if multiple bits are changed in the same block in a short time,
+ * all bits in the block are changed to Dirty/NeedSync, so that there is no
+ * further overhead until the daemon clears the dirty bits.
+ *
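+ * For illustration, locating the page and block that cover a given bit can be
+ * sketched as below (the real logic lives in llbitmap_read(), llbitmap_write()
+ * and llbitmap_set_page_dirty()):
+ *
+ *   pos   = bit + BITMAP_DATA_OFFSET;
+ *   page  = pos >> PAGE_SHIFT;
+ *   block = offset_in_page(pos) / llbitmap->io_size;
+ *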
+ * ##### Dirty Bits synchronization
+ *
+ * The IO fast path sets bits to dirty, and those dirty bits are cleared by
+ * the daemon after the IO is done. llbitmap_page_ctl is used to synchronize
+ * between the IO path and the daemon;
+ *
+ * IO path (see the sketch at the end of this comment):
+ * 1) try to grab a reference; if that succeeds, set the expire time to 5s
+ *    from now and return;
+ * 2) if grabbing a reference fails, wait for the daemon to finish clearing
+ *    dirty bits;
+ *
+ * Daemon (Daemon will be woken up every daemon_sleep seconds):
+ * For each page:
+ * 1) check if page expired, if not skip this page; for expired page:
+ * 2) suspend the page and wait for inflight write IO to be done;
+ * 3) change dirty page to clean;
+ * 4) resume the page;
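+ *
+ * A minimal sketch of the IO-path side of this protocol (this mirrors what
+ * llbitmap_raise_barrier() below does with the per-page percpu_ref):
+ *
+ *   if (percpu_ref_tryget_live(&pctl->active)) {
+ *       WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
+ *   } else {
+ *       wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
+ *       // retry from the top
+ *   }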
+ */
+
+#define BITMAP_DATA_OFFSET 1024
+
+/* 64k is the max IO size of sync IO for raid1/raid10 */
+#define MIN_CHUNK_SIZE (64 * 2)
+
+/* By default, daemon will be woken up every 30s */
+#define DEFAULT_DAEMON_SLEEP 30
+
+/*
+ * Dirtied bits that have not been accessed for more than 5s will be cleared
+ * by daemon.
+ */
+#define DEFAULT_BARRIER_IDLE 5
+
+enum llbitmap_state {
+ /* No valid data, init state after assemble the array */
+ BitUnwritten = 0,
+ /* data is consistent */
+ BitClean,
+ /* data will be consistent after IO is done, set directly for writes */
+ BitDirty,
+ /*
+ * data need to be resynchronized:
+ * 1) set directly for writes if array is degraded, prevent full disk
+ * synchronization after readding a disk;
+ * 2) reassemble the array after power failure, and dirty bits are
+ * found after reloading the bitmap;
+ * 3) set for first write for raid5, to build initial xor data lazily
+ */
+ BitNeedSync,
+ /* data is synchronizing */
+ BitSyncing,
+ BitStateCount,
+ BitNone = 0xff,
+};
+
+enum llbitmap_action {
+ /* User write new data, this is the only action from IO fast path */
+ BitmapActionStartwrite = 0,
+ /* Start recovery */
+ BitmapActionStartsync,
+ /* Finish recovery */
+ BitmapActionEndsync,
+ /* Failed recovery */
+ BitmapActionAbortsync,
+ /* Reassemble the array */
+ BitmapActionReload,
+ /* Daemon thread is trying to clear dirty bits */
+ BitmapActionDaemon,
+ /* Data is deleted */
+ BitmapActionDiscard,
+ /*
+ * Bitmap is stale, mark all bits in addition to BitUnwritten to
+ * BitNeedSync.
+ */
+ BitmapActionStale,
+ BitmapActionCount,
+ /* Init state is BitUnwritten */
+ BitmapActionInit,
+};
+
+enum llbitmap_page_state {
+ LLPageFlush = 0,
+ LLPageDirty,
+};
+
+struct llbitmap_page_ctl {
+ char *state;
+ struct page *page;
+ unsigned long expire;
+ unsigned long flags;
+ wait_queue_head_t wait;
+ struct percpu_ref active;
+ /* Per block size dirty state, maximum 64k page / 1 sector = 128 */
+ unsigned long dirty[];
+};
+
+struct llbitmap {
+ struct mddev *mddev;
+ struct llbitmap_page_ctl **pctl;
+
+ unsigned int nr_pages;
+ unsigned int io_size;
+ unsigned int blocks_per_page;
+
+ /* shift of one chunk */
+ unsigned long chunkshift;
+ /* size of one chunk in sectors */
+ unsigned long chunksize;
+ /* total number of chunks */
+ unsigned long chunks;
+ unsigned long last_end_sync;
+ /*
+ * time in seconds after which dirty bits will be cleared if the page is
+ * not accessed.
+ */
+ unsigned long barrier_idle;
+ /* fires on first BitDirty state */
+ struct timer_list pending_timer;
+ struct work_struct daemon_work;
+
+ unsigned long flags;
+ __u64 events_cleared;
+
+ /* for slow disks */
+ atomic_t behind_writes;
+ wait_queue_head_t behind_wait;
+};
+
+struct llbitmap_unplug_work {
+ struct work_struct work;
+ struct llbitmap *llbitmap;
+ struct completion *done;
+};
+
+static struct workqueue_struct *md_llbitmap_io_wq;
+static struct workqueue_struct *md_llbitmap_unplug_wq;
+
+static char state_machine[BitStateCount][BitmapActionCount] = {
+ [BitUnwritten] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitNone,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitClean] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitDirty] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitClean,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+ [BitNeedSync] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNone,
+ },
+ [BitSyncing] = {
+ [BitmapActionStartwrite] = BitNone,
+ [BitmapActionStartsync] = BitSyncing,
+ [BitmapActionEndsync] = BitDirty,
+ [BitmapActionAbortsync] = BitNeedSync,
+ [BitmapActionReload] = BitNeedSync,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitNeedSync,
+ },
+};
+
+static void __llbitmap_flush(struct mddev *mddev);
+
+static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
+{
+ unsigned int idx;
+ unsigned int offset;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ offset = offset_in_page(pos);
+
+ return llbitmap->pctl[idx]->state[offset];
+}
+
+/* set all the bits in the subpage as dirty */
+static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
+ struct llbitmap_page_ctl *pctl,
+ unsigned int block)
+{
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ unsigned int io_size = llbitmap->io_size;
+ int pos;
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ switch (pctl->state[pos]) {
+ case BitUnwritten:
+ pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
+ break;
+ case BitClean:
+ pctl->state[pos] = BitDirty;
+ break;
+ };
+ }
+}
+
+static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
+ int offset)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+ unsigned int io_size = llbitmap->io_size;
+ int block = offset / io_size;
+ int pos;
+
+ if (!test_bit(LLPageDirty, &pctl->flags))
+ set_bit(LLPageDirty, &pctl->flags);
+
+ /*
+ * For a degraded array, dirty bits will never be cleared, and we must
+ * resync all the dirty bits, hence skip infecting new dirty bits to
+ * avoid resyncing unnecessary data.
+ */
+ if (llbitmap->mddev->degraded) {
+ set_bit(block, pctl->dirty);
+ return;
+ }
+
+ /*
+ * The subpage usually contains a total of 512 bits. If any single bit
+ * within the subpage is marked as dirty, the entire sector will be
+ * written. To avoid impacting write performance, when multiple bits
+ * within the same sector are modified within llbitmap->barrier_idle,
+ * all bits in the sector will be collectively marked as dirty at once.
+ */
+ if (test_and_set_bit(block, pctl->dirty)) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+
+ for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
+ if (pos == offset)
+ continue;
+ if (pctl->state[pos] == BitDirty ||
+ pctl->state[pos] == BitNeedSync) {
+ llbitmap_infect_dirty_bits(llbitmap, pctl, block);
+ return;
+ }
+ }
+}
+
+static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
+ loff_t pos)
+{
+ unsigned int idx;
+ unsigned int bit;
+
+ pos += BITMAP_DATA_OFFSET;
+ idx = pos >> PAGE_SHIFT;
+ bit = offset_in_page(pos);
+
+ llbitmap->pctl[idx]->state[bit] = state;
+ if (state == BitDirty || state == BitNeedSync)
+ llbitmap_set_page_dirty(llbitmap, idx, bit);
+}
+
+static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ struct page *page = NULL;
+ struct md_rdev *rdev;
+
+ if (llbitmap->pctl && llbitmap->pctl[idx])
+ page = llbitmap->pctl[idx]->page;
+ if (page)
+ return page;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset +
+ (idx << PAGE_SECTORS_SHIFT);
+
+ if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
+ true))
+ return page;
+
+ md_error(mddev, rdev);
+ }
+
+ __free_page(page);
+ return ERR_PTR(-EIO);
+}
+
+static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
+{
+ struct page *page = llbitmap->pctl[idx]->page;
+ struct mddev *mddev = llbitmap->mddev;
+ struct md_rdev *rdev;
+ int block;
+
+ for (block = 0; block < llbitmap->blocks_per_page; block++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ if (!test_and_clear_bit(block, pctl->dirty))
+ continue;
+
+ rdev_for_each(rdev, mddev) {
+ sector_t sector;
+ sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
+
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ sector = mddev->bitmap_info.offset + rdev->sb_start +
+ (idx << PAGE_SECTORS_SHIFT) +
+ block * bit_sector;
+ md_write_metadata(mddev, rdev, sector,
+ llbitmap->io_size, page,
+ block * llbitmap->io_size);
+ }
+ }
+}
+
+static void active_release(struct percpu_ref *ref)
+{
+ struct llbitmap_page_ctl *pctl =
+ container_of(ref, struct llbitmap_page_ctl, active);
+
+ wake_up(&pctl->wait);
+}
+
+static void llbitmap_free_pages(struct llbitmap *llbitmap)
+{
+ int i;
+
+ if (!llbitmap->pctl)
+ return;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ if (!pctl || !pctl->page)
+ break;
+
+ __free_page(pctl->page);
+ percpu_ref_exit(&pctl->active);
+ }
+
+ kfree(llbitmap->pctl[0]);
+ kfree(llbitmap->pctl);
+ llbitmap->pctl = NULL;
+}
+
+static int llbitmap_cache_pages(struct llbitmap *llbitmap)
+{
+ struct llbitmap_page_ctl *pctl;
+ unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
+ BITMAP_DATA_OFFSET, PAGE_SIZE);
+ unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
+ llbitmap->blocks_per_page));
+ int i;
+
+ llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!llbitmap->pctl)
+ return -ENOMEM;
+
+ size = round_up(size, cache_line_size());
+ pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
+ if (!pctl) {
+ kfree(llbitmap->pctl);
+ return -ENOMEM;
+ }
+
+ llbitmap->nr_pages = nr_pages;
+
+ for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
+ struct page *page = llbitmap_read_page(llbitmap, i);
+
+ llbitmap->pctl[i] = pctl;
+
+ if (IS_ERR(page)) {
+ llbitmap_free_pages(llbitmap);
+ return PTR_ERR(page);
+ }
+
+ if (percpu_ref_init(&pctl->active, active_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ __free_page(page);
+ llbitmap_free_pages(llbitmap);
+ return -ENOMEM;
+ }
+
+ pctl->page = page;
+ pctl->state = page_address(page);
+ init_waitqueue_head(&pctl->wait);
+ }
+
+ return 0;
+}
+
+static void llbitmap_init_state(struct llbitmap *llbitmap)
+{
+ enum llbitmap_state state = BitUnwritten;
+ unsigned long i;
+
+ if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
+ state = BitClean;
+
+ for (i = 0; i < llbitmap->chunks; i++)
+ llbitmap_write(llbitmap, state, i);
+}
+
+/* The return value is only used from resync, where @start == @end. */
+static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
+ unsigned long start,
+ unsigned long end,
+ enum llbitmap_action action)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ enum llbitmap_state state = BitNone;
+ bool level_456 = raid_is_456(llbitmap->mddev);
+ bool need_resync = false;
+ bool need_recovery = false;
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+ return BitNone;
+
+ if (action == BitmapActionInit) {
+ llbitmap_init_state(llbitmap);
+ return BitNone;
+ }
+
+ while (start <= end) {
+ enum llbitmap_state c = llbitmap_read(llbitmap, start);
+
+ if (c < 0 || c >= BitStateCount) {
+ pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
+ __func__, start, c, action);
+ state = BitNeedSync;
+ goto write_bitmap;
+ }
+
+ if (c == BitNeedSync)
+ need_resync = !mddev->degraded;
+
+ state = state_machine[c][action];
+
+write_bitmap:
+ if (unlikely(mddev->degraded)) {
+ /* For degraded array, mark new data as need sync. */
+ if (state == BitDirty &&
+ action == BitmapActionStartwrite)
+ state = BitNeedSync;
+ /*
+ * For degraded array, resync dirty data as well, noted
+ * if array is still degraded after resync is done, all
+ * new data will still be dirty until array is clean.
+ */
+ else if (c == BitDirty &&
+ action == BitmapActionStartsync)
+ state = BitSyncing;
+ } else if (c == BitUnwritten && state == BitDirty &&
+ action == BitmapActionStartwrite && level_456) {
+ /* Delay raid456 initial recovery to first write. */
+ state = BitNeedSync;
+ }
+
+ if (state == BitNone) {
+ start++;
+ continue;
+ }
+
+ llbitmap_write(llbitmap, state, start);
+
+ if (state == BitNeedSync)
+ need_resync = !mddev->degraded;
+ else if (state == BitDirty &&
+ !timer_pending(&llbitmap->pending_timer))
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + mddev->bitmap_info.daemon_sleep * HZ);
+
+ start++;
+ }
+
+ if (need_resync && level_456)
+ need_recovery = true;
+
+ if (need_recovery) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ } else if (need_resync) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+
+ return state;
+}
+
+static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+retry:
+ if (likely(percpu_ref_tryget_live(&pctl->active))) {
+ WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
+ return;
+ }
+
+ wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
+ goto retry;
+}
+
+static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ percpu_ref_put(&pctl->active);
+}
+
+static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ percpu_ref_kill(&pctl->active);
+
+ if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
+ llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
+{
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
+
+ pctl->expire = LONG_MAX;
+ percpu_ref_resurrect(&pctl->active);
+ wake_up(&pctl->wait);
+}
+
+static int llbitmap_check_support(struct mddev *mddev)
+{
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
+ mdname(mddev));
+ return -EBUSY;
+ }
+
+ if (mddev->bitmap_info.space == 0) {
+ if (mddev->bitmap_info.default_space == 0) {
+ pr_notice("md/llbitmap: %s: no space for bitmap\n",
+ mdname(mddev));
+ return -ENOSPC;
+ }
+ }
+
+ if (!mddev->persistent) {
+ pr_notice("md/llbitmap: %s: array must be persistent\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev->bitmap_info.file) {
+ pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev->bitmap_info.external) {
+ pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ if (mddev_is_dm(mddev)) {
+ pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
+ mdname(mddev));
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int llbitmap_init(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ sector_t blocks = mddev->resync_max_sectors;
+ unsigned long chunksize = MIN_CHUNK_SIZE;
+ unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
+ unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
+ int ret;
+
+ while (chunks > space) {
+ chunksize = chunksize << 1;
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ }
+
+ llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+ llbitmap->chunkshift = ffz(~chunksize);
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = chunks;
+ mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
+
+ ret = llbitmap_cache_pages(llbitmap);
+ if (ret)
+ return ret;
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionInit);
+ /* flush initial llbitmap to disk */
+ __llbitmap_flush(mddev);
+
+ return 0;
+}
+
+static int llbitmap_read_sb(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ unsigned long daemon_sleep;
+ unsigned long chunksize;
+ unsigned long events;
+ struct page *sb_page;
+ bitmap_super_t *sb;
+ int ret = -EINVAL;
+
+ if (!mddev->bitmap_info.offset) {
+ pr_err("md/llbitmap: %s: no super block found", mdname(mddev));
+ return -EINVAL;
+ }
+
+ sb_page = llbitmap_read_page(llbitmap, 0);
+ if (IS_ERR(sb_page)) {
+ pr_err("md/llbitmap: %s: read super block failed",
+ mdname(mddev));
+ return -EIO;
+ }
+
+ sb = kmap_local_page(sb_page);
+ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
+ pr_err("md/llbitmap: %s: invalid super block magic number",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
+ pr_err("md/llbitmap: %s: invalid super block version",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (memcmp(sb->uuid, mddev->uuid, 16)) {
+ pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (mddev->bitmap_info.space == 0) {
+ int room = le32_to_cpu(sb->sectors_reserved);
+
+ if (room)
+ mddev->bitmap_info.space = room;
+ else
+ mddev->bitmap_info.space = mddev->bitmap_info.default_space;
+ }
+ llbitmap->flags = le32_to_cpu(sb->state);
+ if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
+ ret = llbitmap_init(llbitmap);
+ goto out_put_page;
+ }
+
+ chunksize = le32_to_cpu(sb->chunksize);
+ if (!is_power_of_2(chunksize)) {
+ pr_err("md/llbitmap: %s: chunksize not a power of 2",
+ mdname(mddev));
+ goto out_put_page;
+ }
+
+ if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
+ mddev->bitmap_info.space << SECTOR_SHIFT)) {
+ pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu",
+ mdname(mddev), chunksize, mddev->resync_max_sectors,
+ mddev->bitmap_info.space);
+ goto out_put_page;
+ }
+
+ daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+ if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
+ pr_err("md/llbitmap: %s: daemon sleep %lu period out of range",
+ mdname(mddev), daemon_sleep);
+ goto out_put_page;
+ }
+
+ events = le64_to_cpu(sb->events);
+ if (events < mddev->events) {
+ pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery",
+ mdname(mddev), events, mddev->events);
+ set_bit(BITMAP_STALE, &llbitmap->flags);
+ }
+
+ sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+ mddev->bitmap_info.chunksize = chunksize;
+ mddev->bitmap_info.daemon_sleep = daemon_sleep;
+
+ llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
+ llbitmap->chunkshift = ffz(~chunksize);
+ ret = llbitmap_cache_pages(llbitmap);
+
+out_put_page:
+ __free_page(sb_page);
+ kunmap_local(sb);
+ return ret;
+}
+
+static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
+{
+ struct llbitmap *llbitmap =
+ container_of(pending_timer, struct llbitmap, pending_timer);
+
+ if (work_busy(&llbitmap->daemon_work)) {
+ pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
+ mdname(llbitmap->mddev),
+ llbitmap->mddev->bitmap_info.daemon_sleep);
+ set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
+ return;
+ }
+
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+}
+
+static void md_llbitmap_daemon_fn(struct work_struct *work)
+{
+ struct llbitmap *llbitmap =
+ container_of(work, struct llbitmap, daemon_work);
+ unsigned long start;
+ unsigned long end;
+ bool restart;
+ int idx;
+
+ if (llbitmap->mddev->degraded)
+ return;
+retry:
+ start = 0;
+ end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
+ restart = false;
+
+ for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
+
+ if (idx > 0) {
+ start = end + 1;
+ end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
+ }
+
+ if (!test_bit(LLPageFlush, &pctl->flags) &&
+ time_before(jiffies, pctl->expire)) {
+ restart = true;
+ continue;
+ }
+
+ if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
+ pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
+ mdname(llbitmap->mddev), __func__, idx);
+ continue;
+ }
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
+ llbitmap_resume(llbitmap, idx);
+ }
+
+ /*
+ * If the daemon took a long time to finish, retry to prevent missing
+ * clearing dirty bits.
+ */
+ if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
+ goto retry;
+
+ /* If some page is dirty but not expired, setup timer again */
+ if (restart)
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
+}
+
+static int llbitmap_create(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap;
+ int ret;
+
+ ret = llbitmap_check_support(mddev);
+ if (ret)
+ return ret;
+
+ llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
+ if (!llbitmap)
+ return -ENOMEM;
+
+ llbitmap->mddev = mddev;
+ llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
+ llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;
+
+ timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
+ INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
+ atomic_set(&llbitmap->behind_writes, 0);
+ init_waitqueue_head(&llbitmap->behind_wait);
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ mddev->bitmap = llbitmap;
+ ret = llbitmap_read_sb(llbitmap);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ if (ret) {
+ kfree(llbitmap);
+ mddev->bitmap = NULL;
+ }
+
+ return ret;
+}
+
+static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long chunks;
+
+ if (chunksize == 0)
+ chunksize = llbitmap->chunksize;
+
+ /* If there is enough space, leave the chunksize unchanged. */
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
+ chunksize = chunksize << 1;
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
+ }
+
+ llbitmap->chunkshift = ffz(~chunksize);
+ llbitmap->chunksize = chunksize;
+ llbitmap->chunks = chunks;
+
+ return 0;
+}
+
+static int llbitmap_load(struct mddev *mddev)
+{
+ enum llbitmap_action action = BitmapActionReload;
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
+ action = BitmapActionStale;
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
+ return 0;
+}
+
+static void llbitmap_destroy(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (!llbitmap)
+ return;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+
+ timer_delete_sync(&llbitmap->pending_timer);
+ flush_workqueue(md_llbitmap_io_wq);
+ flush_workqueue(md_llbitmap_unplug_wq);
+
+ mddev->bitmap = NULL;
+ llbitmap_free_pages(llbitmap);
+ kfree(llbitmap);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+}
+
+static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_unplug_fn(struct work_struct *work)
+{
+ struct llbitmap_unplug_work *unplug_work =
+ container_of(work, struct llbitmap_unplug_work, work);
+ struct llbitmap *llbitmap = unplug_work->llbitmap;
+ struct blk_plug plug;
+ int i;
+
+ blk_start_plug(&plug);
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
+ !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
+ continue;
+
+ llbitmap_write_page(llbitmap, i);
+ }
+
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+ complete(unplug_work->done);
+}
+
+static bool llbitmap_dirty(struct llbitmap *llbitmap)
+{
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
+ return true;
+
+ return false;
+}
+
+static void llbitmap_unplug(struct mddev *mddev, bool sync)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct llbitmap_unplug_work unplug_work = {
+ .llbitmap = llbitmap,
+ .done = &done,
+ };
+
+ if (!llbitmap_dirty(llbitmap))
+ return;
+
+ /*
+ * Issue new bitmap IO under submit_bio() context will deadlock:
+ * - the bio will wait for bitmap bio to be done, before it can be
+ * issued;
+ * - bitmap bio will be added to current->bio_list and wait for this
+ * bio to be issued;
+ */
+ INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
+ queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
+}
+
+/*
+ * Force to write all bitmap pages to disk, called when stopping the array, or
+ * every daemon_sleep seconds when sync_thread is running.
+ */
+static void __llbitmap_flush(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct blk_plug plug;
+ int i;
+
+ blk_start_plug(&plug);
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ /* mark all blocks as dirty */
+ set_bit(LLPageDirty, &pctl->flags);
+ bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
+ llbitmap_write_page(llbitmap, i);
+ }
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+}
+
+static void llbitmap_flush(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);
+
+ timer_delete_sync(&llbitmap->pending_timer);
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+ flush_work(&llbitmap->daemon_work);
+
+ __llbitmap_flush(mddev);
+}
+
+/* This is used for raid5 lazy initial recovery */
+static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+ enum llbitmap_state c = llbitmap_read(llbitmap, p);
+
+ return c == BitClean || c == BitDirty;
+}
+
+static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+ int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ enum llbitmap_state c = llbitmap_read(llbitmap, p);
+
+ /* always skip unwritten blocks */
+ if (c == BitUnwritten)
+ return blocks;
+
+ /* For degraded array, don't skip */
+ if (mddev->degraded)
+ return 0;
+
+ /* For resync also skip clean/dirty blocks */
+ if ((c == BitClean || c == BitDirty) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ return blocks;
+
+ return 0;
+}
+
+static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+
+ /*
+ * Handle one bit at a time; this is much simpler, and it doesn't matter
+ * if md_do_sync() loops a few more times.
+ */
+ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ return llbitmap_state_machine(llbitmap, p, p,
+ BitmapActionStartsync) == BitSyncing;
+}
+
+/* Something is wrong, sync_thread stop at @offset */
+static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long p = offset >> llbitmap->chunkshift;
+
+ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
+ llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
+ BitmapActionAbortsync);
+}
+
+/* A full sync_thread is finished */
+static void llbitmap_close_sync(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ /* let daemon_fn clear dirty bits immediately */
+ WRITE_ONCE(pctl->expire, jiffies);
+ }
+
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionEndsync);
+}
+
+/*
+ * sync_thread has reached @sector; update metadata every daemon_sleep seconds,
+ * just in case sync_thread has to restart after a power failure.
+ */
+static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
+ bool force)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (sector == 0) {
+ llbitmap->last_end_sync = jiffies;
+ return;
+ }
+
+ if (time_before(jiffies, llbitmap->last_end_sync +
+ HZ * mddev->bitmap_info.daemon_sleep))
+ return;
+
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ mddev->curr_resync_completed = sector;
+ set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+ llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
+ BitmapActionEndsync);
+ __llbitmap_flush(mddev);
+
+ llbitmap->last_end_sync = jiffies;
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
+}
+
+static bool llbitmap_enabled(void *data, bool flush)
+{
+ struct llbitmap *llbitmap = data;
+
+ return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+}
+
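+/* Mark bitmap chunks in the range [@s, @e] as dirty. */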
+static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
+ unsigned long e)
+{
+ llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
+}
+
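+/* Mark the superblock blocks dirty and write them out synchronously. */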
+static void llbitmap_write_sb(struct llbitmap *llbitmap)
+{
+ int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);
+
+ bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
+ llbitmap_write_page(llbitmap, 0);
+ md_super_wait(llbitmap->mddev);
+}
+
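+/* Propagate in-memory array state into the on-disk bitmap superblock. */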
+static void llbitmap_update_sb(void *data)
+{
+ struct llbitmap *llbitmap = data;
+ struct mddev *mddev = llbitmap->mddev;
+ struct page *sb_page;
+ bitmap_super_t *sb;
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
+ return;
+
+ sb_page = llbitmap_read_page(llbitmap, 0);
+ if (IS_ERR(sb_page)) {
+ pr_err("%s: %s: read super block failed\n", __func__,
+ mdname(mddev));
+ set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
+ return;
+ }
+
+ if (mddev->events < llbitmap->events_cleared)
+ llbitmap->events_cleared = mddev->events;
+
+ sb = kmap_local_page(sb_page);
+ sb->events = cpu_to_le64(mddev->events);
+ sb->state = cpu_to_le32(llbitmap->flags);
+ sb->chunksize = cpu_to_le32(llbitmap->chunksize);
+ sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
+ sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
+ sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
+ sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);
+
+ kunmap_local(sb);
+ llbitmap_write_sb(llbitmap);
+}
+
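+/* Fill @stats with the current state of the bitmap. */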
+static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
+{
+ struct llbitmap *llbitmap = data;
+
+ memset(stats, 0, sizeof(*stats));
+
+ stats->missing_pages = 0;
+ stats->pages = llbitmap->nr_pages;
+ stats->file_pages = llbitmap->nr_pages;
+
+ stats->behind_writes = atomic_read(&llbitmap->behind_writes);
+ stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
+ stats->events_cleared = llbitmap->events_cleared;
+
+ return 0;
+}
+
+/* just flag all pages as needing to be written */
+static void llbitmap_write_all(struct mddev *mddev)
+{
+ int i;
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
+
+ set_bit(LLPageDirty, &pctl->flags);
+ bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
+ }
+}
+
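+/* Track in-flight behind writes so that readers can wait for them to drain. */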
+static void llbitmap_start_behind_write(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ atomic_inc(&llbitmap->behind_writes);
+}
+
+static void llbitmap_end_behind_write(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (atomic_dec_and_test(&llbitmap->behind_writes))
+ wake_up(&llbitmap->behind_wait);
+}
+
+static void llbitmap_wait_behind_writes(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ if (!llbitmap)
+ return;
+
+ wait_event(llbitmap->behind_wait,
+ atomic_read(&llbitmap->behind_writes) == 0);
+}
+
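+/* sysfs: show how many chunks are in each llbitmap state. */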
+static ssize_t bits_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap;
+ int bits[BitStateCount] = {0};
+ loff_t start = 0;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ llbitmap = mddev->bitmap;
+ if (!llbitmap || !llbitmap->pctl) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "no bitmap\n");
+ }
+
+ if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "bitmap io error\n");
+ }
+
+ while (start < llbitmap->chunks) {
+ enum llbitmap_state c = llbitmap_read(llbitmap, start);
+
+ if (c < 0 || c >= BitStateCount)
+ pr_err("%s: invalid bit %llu state %d\n",
+ __func__, start, c);
+ else
+ bits[c]++;
+ start++;
+ }
+
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
+ bits[BitUnwritten], bits[BitClean], bits[BitDirty],
+ bits[BitNeedSync], bits[BitSyncing]);
+}
+
+static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
+
+static ssize_t metadata_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap;
+ ssize_t ret;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ llbitmap = mddev->bitmap;
+ if (!llbitmap) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return sprintf(page, "no bitmap\n");
+ }
+
+ ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
+ llbitmap->chunksize, llbitmap->chunkshift,
+ llbitmap->chunks, mddev->bitmap_info.offset,
+ llbitmap->mddev->bitmap_info.daemon_sleep);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+
+ return ret;
+}
+
+static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
+
+static ssize_t
+daemon_sleep_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
+}
+
+static ssize_t
+daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned long timeout;
+ int rv = kstrtoul(buf, 10, &timeout);
+
+ if (rv)
+ return rv;
+
+ mddev->bitmap_info.daemon_sleep = timeout;
+ return len;
+}
+
+static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
+
+static ssize_t
+barrier_idle_show(struct mddev *mddev, char *page)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+
+ return sprintf(page, "%lu\n", llbitmap->barrier_idle);
+}
+
+static ssize_t
+barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long timeout;
+ int rv = kstrtoul(buf, 10, &timeout);
+
+ if (rv)
+ return rv;
+
+ llbitmap->barrier_idle = timeout;
+ return len;
+}
+
+static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
+
+static struct attribute *md_llbitmap_attrs[] = {
+ &llbitmap_bits.attr,
+ &llbitmap_metadata.attr,
+ &llbitmap_daemon_sleep.attr,
+ &llbitmap_barrier_idle.attr,
+ NULL
+};
+
+static struct attribute_group md_llbitmap_group = {
+ .name = "llbitmap",
+ .attrs = md_llbitmap_attrs,
+};
+
+static struct bitmap_operations llbitmap_ops = {
+ .head = {
+ .type = MD_BITMAP,
+ .id = ID_LLBITMAP,
+ .name = "llbitmap",
+ },
+
+ .enabled = llbitmap_enabled,
+ .create = llbitmap_create,
+ .resize = llbitmap_resize,
+ .load = llbitmap_load,
+ .destroy = llbitmap_destroy,
+
+ .start_write = llbitmap_start_write,
+ .end_write = llbitmap_end_write,
+ .start_discard = llbitmap_start_discard,
+ .end_discard = llbitmap_end_discard,
+ .unplug = llbitmap_unplug,
+ .flush = llbitmap_flush,
+
+ .start_behind_write = llbitmap_start_behind_write,
+ .end_behind_write = llbitmap_end_behind_write,
+ .wait_behind_writes = llbitmap_wait_behind_writes,
+
+ .blocks_synced = llbitmap_blocks_synced,
+ .skip_sync_blocks = llbitmap_skip_sync_blocks,
+ .start_sync = llbitmap_start_sync,
+ .end_sync = llbitmap_end_sync,
+ .close_sync = llbitmap_close_sync,
+ .cond_end_sync = llbitmap_cond_end_sync,
+
+ .update_sb = llbitmap_update_sb,
+ .get_stats = llbitmap_get_stats,
+ .dirty_bits = llbitmap_dirty_bits,
+ .write_all = llbitmap_write_all,
+
+ .group = &md_llbitmap_group,
+};
+
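+/* Allocate the llbitmap workqueues and register the bitmap submodule. */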
+int md_llbitmap_init(void)
+{
+ md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!md_llbitmap_io_wq)
+ return -ENOMEM;
+
+ md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!md_llbitmap_unplug_wq) {
+ destroy_workqueue(md_llbitmap_io_wq);
+ md_llbitmap_io_wq = NULL;
+ return -ENOMEM;
+ }
+
+ return register_md_submodule(&llbitmap_ops.head);
+}
+
+void md_llbitmap_exit(void)
+{
+ destroy_workqueue(md_llbitmap_io_wq);
+ md_llbitmap_io_wq = NULL;
+ destroy_workqueue(md_llbitmap_unplug_wq);
+ md_llbitmap_unplug_wq = NULL;
+ unregister_md_submodule(&llbitmap_ops.head);
+}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1baaf52c603c..1de550108756 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq;
* workqueue whith reconfig_mutex grabbed.
*/
static struct workqueue_struct *md_misc_wq;
-struct workqueue_struct *md_bitmap_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
@@ -677,8 +676,64 @@ static void active_io_release(struct percpu_ref *ref)
static void no_op(struct percpu_ref *r) {}
+static bool mddev_set_bitmap_ops(struct mddev *mddev)
+{
+ struct bitmap_operations *old = mddev->bitmap_ops;
+ struct md_submodule_head *head;
+
+ if (mddev->bitmap_id == ID_BITMAP_NONE ||
+ (old && old->head.id == mddev->bitmap_id))
+ return true;
+
+ xa_lock(&md_submodule);
+ head = xa_load(&md_submodule, mddev->bitmap_id);
+
+ if (!head) {
+ pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id);
+ goto err;
+ }
+
+ if (head->type != MD_BITMAP) {
+ pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id);
+ goto err;
+ }
+
+ mddev->bitmap_ops = (void *)head;
+ xa_unlock(&md_submodule);
+
+ if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) {
+ if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group))
+ pr_warn("md: cannot register extra bitmap attributes for %s\n",
+ mdname(mddev));
+ else
+ /*
+ * Inform user with KOBJ_CHANGE about new bitmap
+ * attributes.
+ */
+ kobject_uevent(&mddev->kobj, KOBJ_CHANGE);
+ }
+ return true;
+
+err:
+ xa_unlock(&md_submodule);
+ return false;
+}
+
+static void mddev_clear_bitmap_ops(struct mddev *mddev)
+{
+ if (!mddev_is_dm(mddev) && mddev->bitmap_ops &&
+ mddev->bitmap_ops->group)
+ sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group);
+
+ mddev->bitmap_ops = NULL;
+}
+
int mddev_init(struct mddev *mddev)
{
+ if (!IS_ENABLED(CONFIG_MD_BITMAP))
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ else
+ mddev->bitmap_id = ID_BITMAP;
if (percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
@@ -713,7 +768,6 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
- mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
@@ -1020,15 +1074,26 @@ static void super_written(struct bio *bio)
wake_up(&mddev->sb_wait);
}
-void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page)
+/**
+ * md_write_metadata - write metadata to underlying disk, including
+ * array superblock, badblocks, bitmap superblock and bitmap bits.
+ * @mddev: the array to write
+ * @rdev: the underlying disk to write
+ * @sector: the offset to @rdev
+ * @size: the length of the metadata
+ * @page: the metadata
+ * @offset: the offset to @page
+ *
+ * Write @size bytes of @page, starting at @offset, to @sector of @rdev. Increment
+ * mddev->pending_writes before returning, and decrement it on completion,
+ * waking up sb_wait. The caller must call md_super_wait() after issuing IO to all
+ * rdevs. If an error occurs, md_error() will be called, and the @rdev will be
+ * kicked out of @mddev.
+ */
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset)
{
- /* write first size bytes of page to sector of rdev
- * Increment mddev->pending_writes before returning
- * and decrement it on completion, waking up sb_wait
- * if zero is reached.
- * If an error occurred, call md_error
- */
struct bio *bio;
if (!page)
@@ -1046,7 +1111,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector;
- __bio_add_page(bio, page, size, 0);
+ __bio_add_page(bio, page, size, offset);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
@@ -1356,6 +1421,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev)
struct md_bitmap_stats stats;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return 0;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return 0;
@@ -1653,8 +1721,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
}
@@ -2302,8 +2370,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
sb->super_offset = cpu_to_le64(rdev->sb_start);
sb->sb_csum = calc_sb_1_csum(sb);
do {
- md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(rdev->mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
} while (md_super_wait(rdev->mddev) < 0);
return num_sectors;
@@ -2313,13 +2381,15 @@ static int
super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
+ struct mddev *mddev = rdev->mddev;
+
/* All necessary checks on new >= old have been done */
if (new_offset >= rdev->data_offset)
return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */
- if (rdev->mddev->minor_version == 0)
+ if (mddev->minor_version == 0)
return 1;
/* otherwise we must be sure not to step on
@@ -2331,8 +2401,7 @@ super_1_allow_new_offset(struct md_rdev *rdev,
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
- if (!rdev->mddev->bitmap_info.file) {
- struct mddev *mddev = rdev->mddev;
+ if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) {
struct md_bitmap_stats stats;
int err;
@@ -2804,24 +2873,24 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
- mddev->bitmap_ops->update_sb(mddev->bitmap);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
if (!test_bit(Faulty, &rdev->flags)) {
- md_super_write(mddev,rdev,
- rdev->sb_start, rdev->sb_size,
- rdev->sb_page);
+ md_write_metadata(mddev, rdev, rdev->sb_start,
+ rdev->sb_size, rdev->sb_page, 0);
pr_debug("md: (write) %pg's sb offset: %llu\n",
rdev->bdev,
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
if (rdev->badblocks.size) {
- md_super_write(mddev, rdev,
- rdev->badblocks.sector,
- rdev->badblocks.size << 9,
- rdev->bb_page);
+ md_write_metadata(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page, 0);
rdev->badblocks.size = 0;
}
@@ -4150,6 +4219,86 @@ static struct md_sysfs_entry md_new_level =
__ATTR(new_level, 0664, new_level_show, new_level_store);
static ssize_t
+bitmap_type_show(struct mddev *mddev, char *page)
+{
+ struct md_submodule_head *head;
+ unsigned long i;
+ ssize_t len = 0;
+
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ len += sprintf(page + len, "[none] ");
+ else
+ len += sprintf(page + len, "none ");
+
+ xa_lock(&md_submodule);
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type != MD_BITMAP)
+ continue;
+
+ if (mddev->bitmap_id == head->id)
+ len += sprintf(page + len, "[%s] ", head->name);
+ else
+ len += sprintf(page + len, "%s ", head->name);
+ }
+ xa_unlock(&md_submodule);
+
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+static ssize_t
+bitmap_type_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct md_submodule_head *head;
+ enum md_submodule_id id;
+ unsigned long i;
+ int err = 0;
+
+ xa_lock(&md_submodule);
+
+ if (mddev->bitmap_ops) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (cmd_match(buf, "none")) {
+ mddev->bitmap_id = ID_BITMAP_NONE;
+ goto out;
+ }
+
+ xa_for_each(&md_submodule, i, head) {
+ if (head->type == MD_BITMAP && cmd_match(buf, head->name)) {
+ mddev->bitmap_id = head->id;
+ goto out;
+ }
+ }
+
+ err = kstrtoint(buf, 10, &id);
+ if (err)
+ goto out;
+
+ if (id == ID_BITMAP_NONE) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ head = xa_load(&md_submodule, id);
+ if (head && head->type == MD_BITMAP) {
+ mddev->bitmap_id = id;
+ goto out;
+ }
+
+ err = -ENOENT;
+
+out:
+ xa_unlock(&md_submodule);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_bitmap_type =
+__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store);
+
+static ssize_t
layout_show(struct mddev *mddev, char *page)
{
/* just a number, not meaningful for all levels */
@@ -4680,6 +4829,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
unsigned long chunk, end_chunk;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return len;
+
err = mddev_lock(mddev);
if (err)
return err;
@@ -5752,6 +5904,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_new_level.attr,
+ &md_bitmap_type.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -5801,7 +5954,6 @@ static const struct attribute_group md_redundancy_group = {
static const struct attribute_group *md_attr_groups[] = {
&md_default_group,
- &md_bitmap_group,
NULL,
};
@@ -6133,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+static int md_bitmap_create(struct mddev *mddev)
+{
+ if (mddev->bitmap_id == ID_BITMAP_NONE)
+ return -EINVAL;
+
+ if (!mddev_set_bitmap_ops(mddev))
+ return -ENOENT;
+
+ return mddev->bitmap_ops->create(mddev);
+}
+
+static void md_bitmap_destroy(struct mddev *mddev)
+{
+ if (!md_bitmap_registered(mddev))
+ return;
+
+ mddev->bitmap_ops->destroy(mddev);
+ mddev_clear_bitmap_ops(mddev);
+}
+
int md_run(struct mddev *mddev)
{
int err;
@@ -6299,7 +6471,7 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- err = mddev->bitmap_ops->create(mddev);
+ err = md_bitmap_create(mddev);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
@@ -6372,7 +6544,7 @@ bitmap_abort:
pers->free(mddev, mddev->private);
mddev->private = NULL;
put_pers(pers);
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
@@ -6392,10 +6564,12 @@ int do_md_run(struct mddev *mddev)
if (err)
goto out;
- err = mddev->bitmap_ops->load(mddev);
- if (err) {
- mddev->bitmap_ops->destroy(mddev);
- goto out;
+ if (md_bitmap_registered(mddev)) {
+ err = mddev->bitmap_ops->load(mddev);
+ if (err) {
+ md_bitmap_destroy(mddev);
+ goto out;
+ }
}
if (mddev_is_clustered(mddev))
@@ -6546,7 +6720,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0);
}
- mddev->bitmap_ops->flush(mddev);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6573,7 +6748,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- mddev->bitmap_ops->wait_behind_writes(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6589,7 +6765,7 @@ static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -7307,6 +7483,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
{
int err = 0;
+ if (!md_bitmap_registered(mddev))
+ return -EINVAL;
+
if (mddev->pers) {
if (!mddev->pers->quiesce || !mddev->thread)
return -EBUSY;
@@ -7363,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- err = mddev->bitmap_ops->create(mddev);
+ err = md_bitmap_create(mddev);
if (!err)
err = mddev->bitmap_ops->load(mddev);
if (err) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
}
}
@@ -7679,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- rv = mddev->bitmap_ops->create(mddev);
+ rv = md_bitmap_create(mddev);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
} else {
struct md_bitmap_stats stats;
@@ -7710,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
put_cluster_ops(mddev);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev->bitmap_ops->destroy(mddev);
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7747,9 +7926,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
*/
-static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mddev *mddev = bdev->bd_disk->private_data;
+ struct mddev *mddev = disk->private_data;
geo->heads = 2;
geo->sectors = 4;
@@ -8491,6 +8670,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
unsigned long chunk_kb;
int err;
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return;
@@ -8873,18 +9055,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio);
static void md_bitmap_start(struct mddev *mddev,
struct md_io_clone *md_io_clone)
{
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->start_discard :
+ mddev->bitmap_ops->start_write;
+
if (mddev->pers->bitmap_sector)
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
&md_io_clone->sectors);
- mddev->bitmap_ops->start_write(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
}
static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
{
- mddev->bitmap_ops->end_write(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ?
+ mddev->bitmap_ops->end_discard :
+ mddev->bitmap_ops->end_write;
+
+ fn(mddev, md_io_clone->offset, md_io_clone->sectors);
}
static void md_end_clone_io(struct bio *bio)
@@ -8893,7 +9081,7 @@ static void md_end_clone_io(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
- if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
md_bitmap_end(mddev, md_io_clone);
if (bio->bi_status && !orig_bio->bi_status)
@@ -8920,9 +9108,10 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
- if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) {
md_io_clone->offset = (*bio)->bi_iter.bi_sector;
md_io_clone->sectors = bio_sectors(*bio);
+ md_io_clone->rw = op_stat_group(bio_op(*bio));
md_bitmap_start(mddev, md_io_clone);
}
@@ -8944,7 +9133,7 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
- if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
md_bitmap_end(mddev, md_io_clone);
if (bio->bi_status && !orig_bio->bi_status)
@@ -9010,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev,
}
}
+/*
+ * If lazy recovery is requested and all rdevs are in sync, select the rdev with
+ * the highest index to perform recovery to build the initial xor data; this is
+ * the same behaviour as the old bitmap.
+ */
+static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
+{
+ struct md_rdev *recover_rdev = NULL;
+ struct md_rdev *rdev;
+ bool ret = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->raid_disk < 0)
+ continue;
+
+ if (test_bit(Faulty, &rdev->flags) ||
+ !test_bit(In_sync, &rdev->flags))
+ break;
+
+ if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
+ recover_rdev = rdev;
+ }
+
+ if (recover_rdev) {
+ clear_bit(In_sync, &recover_rdev->flags);
+ ret = true;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
{
sector_t start = 0;
@@ -9041,6 +9263,14 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
start = rdev->recovery_offset;
rcu_read_unlock();
+ /*
+ * If there are no spares and raid456 lazy initial recovery is
+ * requested, start recovery from the beginning.
+ */
+ if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
+ start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
+ start = 0;
+
/* If there is a bitmap, we need to make sure all
* writes that started before we added a spare
* complete before we start doing a recovery.
@@ -9061,19 +9291,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
static bool sync_io_within_limit(struct mddev *mddev)
{
- int io_sectors;
-
/*
* For raid456, sync IO is stripe(4k) per IO, for other levels, it's
* RESYNC_PAGES(64k) per IO.
*/
- if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
- io_sectors = 8;
- else
- io_sectors = 128;
-
return atomic_read(&mddev->recovery_active) <
- io_sectors * sync_io_depth(mddev);
+ (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
}
#define SYNC_MARKS 10
@@ -9278,6 +9501,12 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+ if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) {
+ sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j);
+ if (sectors)
+ goto update;
+ }
+
sectors = mddev->pers->sync_request(mddev, j, max_sectors,
&skipped);
if (sectors == 0) {
@@ -9293,6 +9522,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
+update:
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
@@ -9602,6 +9832,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
return true;
}
@@ -9610,6 +9841,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
remove_spares(mddev, NULL);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
return true;
}
@@ -9619,7 +9851,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
* re-add.
*/
*spares = remove_and_add_spares(mddev, NULL);
- if (*spares) {
+ if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -9677,7 +9909,7 @@ static void md_start_sync(struct work_struct *ws)
* We are adding a device or devices to an array which has the bitmap
* stored on all devices. So make sure all bitmap pages get written.
*/
- if (spares)
+ if (spares && md_bitmap_enabled(mddev, true))
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
@@ -9765,7 +9997,7 @@ static void unregister_sync_thread(struct mddev *mddev)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (mddev->bitmap)
+ if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
@@ -9832,6 +10064,7 @@ void md_check_recovery(struct mddev *mddev)
}
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
@@ -9942,6 +10175,7 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
/*
* We call mddev->cluster_ops->update_size here because sync_size could
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
@@ -10089,8 +10323,16 @@ static void md_geninit(void)
static int __init md_init(void)
{
- int ret = -ENOMEM;
+ int ret = md_bitmap_init();
+
+ if (ret)
+ return ret;
+ ret = md_llbitmap_init();
+ if (ret)
+ goto err_bitmap;
+
+ ret = -ENOMEM;
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
goto err_wq;
@@ -10099,11 +10341,6 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
- md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
- 0);
- if (!md_bitmap_wq)
- goto err_bitmap_wq;
-
ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
goto err_md;
@@ -10122,12 +10359,13 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
- destroy_workqueue(md_bitmap_wq);
-err_bitmap_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
err_wq:
+ md_llbitmap_exit();
+err_bitmap:
+ md_bitmap_exit();
return ret;
}
@@ -10145,7 +10383,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
if (ret)
pr_info("md-cluster: resize failed\n");
- else
+ else if (md_bitmap_enabled(mddev, false))
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
@@ -10433,8 +10671,8 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
- destroy_workqueue(md_bitmap_wq);
destroy_workqueue(md_wq);
+ md_bitmap_exit();
}
subsys_initcall(md_init);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 51af29a03079..1979c2d4fe89 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -26,7 +26,7 @@
enum md_submodule_type {
MD_PERSONALITY = 0,
MD_CLUSTER,
- MD_BITMAP, /* TODO */
+ MD_BITMAP,
};
enum md_submodule_id {
@@ -38,8 +38,9 @@ enum md_submodule_id {
ID_RAID6 = 6,
ID_RAID10 = 10,
ID_CLUSTER,
- ID_BITMAP, /* TODO */
- ID_LLBITMAP, /* TODO */
+ ID_BITMAP,
+ ID_LLBITMAP,
+ ID_BITMAP_NONE,
};
struct md_submodule_head {
@@ -565,6 +566,7 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
+ enum md_submodule_id bitmap_id;
void *bitmap; /* the bitmap for the device */
struct bitmap_operations *bitmap_ops;
struct {
@@ -665,6 +667,8 @@ enum recovery_flags {
MD_RECOVERY_RESHAPE,
/* remote node is running resync thread */
MD_RESYNCING_REMOTE,
+ /* raid456 lazy initial recover */
+ MD_RECOVERY_LAZY_RECOVER,
};
enum md_ro_state {
@@ -796,7 +800,6 @@ struct md_sysfs_entry {
ssize_t (*show)(struct mddev *, char *);
ssize_t (*store)(struct mddev *, const char *, size_t);
};
-extern const struct attribute_group md_bitmap_group;
static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
{
@@ -873,6 +876,7 @@ struct md_io_clone {
unsigned long start_time;
sector_t offset;
unsigned long sectors;
+ enum stat_group rw;
struct bio bio_clone;
};
@@ -909,8 +913,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
-extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
- sector_t sector, int size, struct page *page);
+void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page,
+ unsigned int offset);
extern int md_super_wait(struct mddev *mddev);
extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, blk_opf_t opf, bool metadata_op);
@@ -1013,7 +1018,6 @@ struct mdu_array_info_s;
struct mdu_disk_info_s;
extern int mdp_major;
-extern struct workqueue_struct *md_bitmap_wq;
void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
@@ -1034,6 +1038,12 @@ static inline bool mddev_is_dm(struct mddev *mddev)
return !mddev->gendisk;
}
+static inline bool raid_is_456(struct mddev *mddev)
+{
+ return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 ||
+ mddev->level == ID_RAID6;
+}
+
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
sector_t sector)
{
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f1d8811a542a..adc9e68d064d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -463,21 +463,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
- struct bio *split = bio_split(bio,
- zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
- &mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio = bio_submit_split_bioset(bio,
+ zone->zone_end - bio->bi_iter.bi_sector,
+ &mddev->bio_set);
+ if (!bio)
return;
- }
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
end = zone->zone_end;
- } else
+ } else {
end = bio_end_sector(bio);
+ }
orig_end = end;
if (zone != conf->strip_zone)
@@ -612,17 +607,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
: sector_div(sector, chunk_sects));
if (sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, sectors, GFP_NOIO,
+ bio = bio_submit_split_bioset(bio, sectors,
&mddev->bio_set);
-
- if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ if (!bio)
return true;
- }
- bio_chain(split, bio);
- raid0_map_submit_bio(mddev, bio);
- bio = split;
}
raid0_map_submit_bio(mddev, bio);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 52881e6032da..521625756128 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
- if (!mddev->bitmap_ops->enabled(mddev)) {
+ if (!md_bitmap_enabled(mddev, true)) {
raid1_submit_write(bio);
return true;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 408c26398321..3f4ed2272ac4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r1_bio->bios[j] = bio;
}
/*
@@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct raid1_info *mirror;
struct bio *read_bio;
int max_sectors;
- int rdisk, error;
+ int rdisk;
bool r1bio_existed = !!r1_bio;
/*
@@ -1366,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
- if (test_bit(WriteMostly, &mirror->rdev->flags)) {
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ md_bitmap_enabled(mddev, false)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
@@ -1376,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1413,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
err_handle:
atomic_dec(&mirror->rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -1452,12 +1448,36 @@ retry:
return true;
}
+static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio,
+ struct bio *bio)
+{
+ unsigned long max_write_behind = mddev->bitmap_info.max_write_behind;
+ struct md_bitmap_stats stats;
+ int err;
+
+ /* behind writes rely on the bitmap, see bitmap_operations */
+ if (!md_bitmap_enabled(mddev, false))
+ return;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return;
+
+ /* Don't do behind IO if reader is waiting, or there are too many. */
+ if (!stats.behind_wait && stats.behind_writes < max_write_behind)
+ alloc_behind_master_bio(r1_bio, bio);
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
+}
+
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int max_write_sectors)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- int i, disks, k, error;
+ int i, disks, k;
unsigned long flags;
int first_clone;
int max_sectors;
@@ -1561,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* complexity of supporting that is not worth
* the benefit.
*/
- if (bio->bi_opf & REQ_ATOMIC) {
- error = -EIO;
+ if (bio->bi_opf & REQ_ATOMIC)
goto err_handle;
- }
good_sectors = first_bad - r1_bio->sector;
if (good_sectors < max_sectors)
@@ -1584,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, &conf->bio_split);
-
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
+ if (!bio) {
+ set_bit(R1BIO_Returned, &r1_bio->state);
goto err_handle;
}
- bio_chain(split, bio);
- submit_bio_noacct(bio);
- bio = split;
+
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
}
@@ -1612,22 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
- unsigned long max_write_behind =
- mddev->bitmap_info.max_write_behind;
- struct md_bitmap_stats stats;
- int err;
-
- /* do behind I/O ?
- * Not if there are too many, or cannot
- * allocate memory, or a reader on WriteMostly
- * is waiting for behind writes to flush */
- err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
- if (!err && write_behind && !stats.behind_wait &&
- stats.behind_writes < max_write_behind)
- alloc_behind_master_bio(r1_bio, bio);
-
- if (test_bit(R1BIO_BehindIO, &r1_bio->state))
- mddev->bitmap_ops->start_behind_write(mddev);
+ if (write_behind)
+ raid1_start_write_behind(mddev, r1_bio, bio);
first_clone = 0;
}
@@ -1683,8 +1684,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R1BIO_Uptodate, &r1_bio->state);
raid_end_bio_io(r1_bio);
}
@@ -2057,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
- mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
+ md_bitmap_end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2804,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current addess in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2829,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call the bitmap_start_sync doesn't actually record anything
*/
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2846,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
-
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
- mddev_is_clustered(mddev) &&
- (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS >
+ conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -3004,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks, still_degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr,
+ &sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3324,15 +3325,17 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
- int ret;
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index d236ef179cfb..2ebe35aaa534 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -178,7 +178,9 @@ enum r1bio_state {
* any write was successful. Otherwise we call when
* any write-behind write succeeds, otherwise we call
* with failure when last write completes (and all failed).
- * Record that bi_end_io was called with this flag...
+ *
+ * And for bio_split errors, record that bi_end_io was called
+ * with this flag...
*/
R1BIO_Returned,
/* If a write for this request means we can clear some
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b60c30bfb6c7..8996fa0dff60 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].bio = bio;
if (!conf->have_replacement)
continue;
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio)
goto out_free_bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
+ bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
r10_bio->devs[j].repl_bio = bio;
}
/*
@@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
struct bio *bio = r10_bio->master_bio;
struct r10conf *conf = r10_bio->mddev->private;
- if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- bio->bi_status = BLK_STS_IOERR;
+ if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) {
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
- bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
@@ -1154,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
int slot = r10_bio->read_slot;
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
- int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
@@ -1203,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
rdev->bdev,
(unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, max_sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
r10_bio->sectors = max_sectors;
}
@@ -1241,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
return;
err_handle:
atomic_dec(&rdev->nr_pending);
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1351,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int i, k;
sector_t sectors;
int max_sectors;
- int error;
if ((mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE,
@@ -1465,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* complexity of supporting that is not worth
* the benefit.
*/
- if (bio->bi_opf & REQ_ATOMIC) {
- error = -EIO;
+ if (bio->bi_opf & REQ_ATOMIC)
goto err_handle;
- }
good_sectors = first_bad - dev_sector;
if (good_sectors < max_sectors)
@@ -1489,17 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->sectors = max_sectors;
if (r10_bio->sectors < bio_sectors(bio)) {
- struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, &conf->bio_split);
- if (IS_ERR(split)) {
- error = PTR_ERR(split);
- goto err_handle;
- }
- bio_chain(split, bio);
allow_barrier(conf);
- submit_bio_noacct(bio);
+ bio = bio_submit_split_bioset(bio, r10_bio->sectors,
+ &conf->bio_split);
wait_barrier(conf, false);
- bio = split;
+ if (!bio) {
+ set_bit(R10BIO_Returned, &r10_bio->state);
+ goto err_handle;
+ }
+
r10_bio->master_bio = bio;
}
@@ -1531,8 +1523,6 @@ err_handle:
}
}
- bio->bi_status = errno_to_blk_status(error);
- set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
}
@@ -1679,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the fist split part */
submit_bio_noacct(split);
@@ -1694,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
return 0;
}
+
bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf);
/* Resend the second split part */
submit_bio_noacct(bio);
@@ -3221,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- mddev->bitmap_ops->end_sync(mddev,
- mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
- mddev->bitmap_ops->end_sync(mddev, sect,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, sect, &sync_blocks);
}
} else {
/* completed sync */
@@ -3249,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3351,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks,
- true);
+ must_sync = md_bitmap_start_sync(mddev, sect,
+ &sync_blocks, true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3396,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
- must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
- &sync_blocks, still_degraded);
-
+ md_bitmap_start_sync(mddev, sect, &sync_blocks,
+ still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
int k;
@@ -3570,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
- if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
- &sync_blocks,
- mddev->degraded) &&
+ if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks,
+ mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -4225,7 +4216,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
- int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4239,9 +4229,12 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > size)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, size, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
@@ -4507,8 +4500,9 @@ static int raid10_start_reshape(struct mddev *mddev)
oldsize = raid10_size(mddev, 0, 0);
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
- if (!mddev_is_clustered(mddev)) {
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ if (!mddev_is_clustered(mddev) &&
+ md_bitmap_enabled(mddev, false)) {
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
else
@@ -4530,13 +4524,14 @@ static int raid10_start_reshape(struct mddev *mddev)
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
- ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ /* cluster can't be set up without a bitmap */
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
if (ret)
goto abort;
ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
- mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
+ mddev->bitmap_ops->resize(mddev, oldsize, 0);
goto abort;
}
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3f16ad6904a9..da00a55f7a55 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -165,6 +165,8 @@ enum r10bio_state {
* so that raid10d knows what to do with them.
*/
R10BIO_ReadError,
+/* For bio_split errors, record that bi_end_io was called. */
+ R10BIO_Returned,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag.
*/
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..04add718be77 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
int disks)
{
int rmw = 0, rcw = 0, i;
- sector_t resync_offset = conf->mddev->resync_offset;
+ struct mddev *mddev = conf->mddev;
+ sector_t resync_offset = mddev->resync_offset;
/* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or
@@ -4116,6 +4117,12 @@ static int handle_stripe_dirtying(struct r5conf *conf,
pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
conf->rmw_level, (unsigned long long)resync_offset,
(unsigned long long)sh->sector);
+ } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
+ !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) {
+ /* The initial recovery is not done, must read everything */
+ rcw = 1; rmw = 2;
+ pr_debug("force RCW by lazy recovery, sh->sector=%llu\n",
+ (unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
@@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+ mddev_add_trace_msg(mddev, "raid5 rmw %llu %d",
sh->sector, rmw);
for (i = disks; i--; ) {
@@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
- if (rcw && !mddev_is_dm(conf->mddev))
- blk_add_trace_msg(conf->mddev->gendisk->queue,
+ if (rcw && !mddev_is_dm(mddev))
+ blk_add_trace_msg(mddev->gendisk->queue,
"raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
@@ -4698,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
- else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
- /* in sync if before recovery_offset */
- set_bit(R5_Insync, &dev->flags);
- else if (test_bit(R5_UPTODATE, &dev->flags) &&
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <=
+ rdev->recovery_offset) {
+ /*
+ * in sync if:
+ * - normal IO, or
+ * - resync IO that is not lazy recovery
+ *
+ * For lazy recovery, we have to mark the rdev without
+ * In_sync as failed, to build initial xor data.
+ */
+ if (!test_bit(STRIPE_SYNCING, &sh->state) ||
+ !test_bit(MD_RECOVERY_LAZY_RECOVER,
+ &conf->mddev->recovery))
+ set_bit(R5_Insync, &dev->flags);
+ } else if (test_bit(R5_UPTODATE, &dev->flags) &&
test_bit(R5_Expanded, &dev->flags))
/* If we've reshaped into here, we assume it is Insync.
* We will shortly update recovery_offset to make
@@ -5468,17 +5486,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
- struct bio *split;
sector_t sector = raid_bio->bi_iter.bi_sector;
unsigned chunk_sects = mddev->chunk_sectors;
unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
if (sectors < bio_sectors(raid_bio)) {
struct r5conf *conf = mddev->private;
- split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
- bio_chain(split, raid_bio);
- submit_bio_noacct(raid_bio);
- raid_bio = split;
+
+ raid_bio = bio_submit_split_bioset(raid_bio, sectors,
+ &conf->bio_split);
+ if (!raid_bio)
+ return NULL;
}
if (!raid5_read_one_chunk(mddev, raid_bio))
@@ -6492,11 +6510,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
- &sync_blocks);
+ md_bitmap_end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- mddev->bitmap_ops->close_sync(mddev);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
@@ -6525,8 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- true) &&
+ !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6535,7 +6553,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
+ if (md_bitmap_enabled(mddev, false))
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6557,9 +6576,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
still_degraded = true;
}
- mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
- still_degraded);
-
+ md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6763,7 +6780,8 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- mddev->bitmap_ops->unplug(mddev, true);
+ if (md_bitmap_enabled(mddev, true))
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -8312,7 +8330,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
- int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8322,9 +8339,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
mddev->array_sectors > newsize)
return -EINVAL;
- ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
- if (ret)
- return ret;
+ if (md_bitmap_enabled(mddev, false)) {
+ int ret = mddev->bitmap_ops->resize(mddev, sectors, 0);
+
+ if (ret)
+ return ret;
+ }
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
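A recurring pattern in the raid5 changes above: direct mddev->bitmap_ops calls are now either guarded by md_bitmap_enabled() or routed through md_bitmap_*() wrappers from md-bitmap.h (also changed in this series). A minimal sketch of such a guard, using a hypothetical helper name and mirroring the raid5d() hunk above:

/* hypothetical wrapper: only call into bitmap_ops when a bitmap is attached */
static inline void demo_bitmap_unplug(struct mddev *mddev)
{
	if (md_bitmap_enabled(mddev, true))
		mddev->bitmap_ops->unplug(mddev, true);
}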
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index d34892782f6e..1af157ce0a63 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb)
msb->card = NULL;
}
-static int msb_bd_getgeo(struct block_device *bdev,
+static int msb_bd_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct msb_data *msb = bdev->bd_disk->private_data;
+ struct msb_data *msb = disk->private_data;
*geo = msb->geometry;
return 0;
}
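The memstick change above is the first of many mechanical ->getgeo conversions in this diff: the callback now receives the gendisk directly instead of a block_device. A minimal sketch of an implementation under the new prototype (hypothetical driver, geometry values made up, only get_capacity() from this diff is assumed):

static int demo_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
	/* fabricate a fixed CHS mapping from the disk capacity */
	geo->heads = 4;
	geo->sectors = 16;
	geo->cylinders = get_capacity(disk) / (4 * 16);
	return 0;
}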
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c9853d887d28..075519caa547 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk)
kfree(msb);
}
-static int mspro_block_bd_getgeo(struct block_device *bdev,
+static int mspro_block_bd_getgeo(struct gendisk *disk,
struct hd_geometry *geo)
{
- struct mspro_block_data *msb = bdev->bd_disk->private_data;
+ struct mspro_block_data *msb = disk->private_data;
geo->heads = msb->heads;
geo->sectors = msb->sectors_per_track;
diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c
index 3a64dc7a7e27..3304f8824cf7 100644
--- a/drivers/message/fusion/mptscsih.c
+++ b/drivers/message/fusion/mptscsih.c
@@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf,
* This is anyone's guess quite frankly.
*/
int
-mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev,
+mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads;
diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h
index 8c2bb2331fc1..f9678d48100c 100644
--- a/drivers/message/fusion/mptscsih.h
+++ b/drivers/message/fusion/mptscsih.h
@@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt);
extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt);
extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt);
extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt);
-extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]);
+extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]);
extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r);
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 9cc47bf94804..d1f295af9713 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -435,9 +435,9 @@ static void mmc_blk_release(struct gendisk *disk)
}
static int
-mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16);
+ geo->cylinders = get_capacity(disk) / (4 * 16);
geo->heads = 4;
geo->sectors = 16;
return 0;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 847c11542f02..28e09d080440 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -246,9 +246,9 @@ unlock:
blktrans_dev_put(dev);
}
-static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int blktrans_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
+ struct mtd_blktrans_dev *dev = disk->private_data;
int ret = -ENXIO;
mutex_lock(&dev->lock);
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 39cc0a6a4d37..b53fd147fa65 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd)
mutex_unlock(&dev->dev_mutex);
}
-static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* Some tools might require this information */
geo->heads = 1;
geo->cylinders = 1;
- geo->sectors = get_capacity(bdev->bd_disk);
+ geo->sectors = get_capacity(disk);
geo->start = 0;
return 0;
}
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 2a1aa32e6693..a933db961ed7 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio)
bio_endio(bio);
}
-static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
+static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* some standard values */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
return 0;
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 42c5f05680f1..734ad725e6f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1803,12 +1803,12 @@ static void nvme_release(struct gendisk *disk)
nvme_ns_release(disk->private_data);
}
-int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
/* some standard values */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
- geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
+ geo->cylinders = get_capacity(disk) >> 11;
return 0;
}
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 6b3ac8ae3f34..f778f3b5214b 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -142,14 +142,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
0, rq_data_dir(req));
-
if (ret)
return ret;
- bio = req->bio;
- if (bdev)
- bio_set_dev(bio, bdev);
-
if (has_metadata) {
ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
if (ret)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cfd2b5b90b91..102fae6a231c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id);
-int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_attr_groups[];
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4c6ee8f9f681..c916176bd9f0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -260,12 +260,6 @@ enum nvme_iod_flags {
/* single segment dma mapping */
IOD_SINGLE_SEGMENT = 1U << 2,
- /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
- IOD_P2P_BUS_ADDR = 1U << 3,
-
- /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
- IOD_META_P2P_BUS_ADDR = 1U << 4,
-
/* Metadata using non-coalesced MPTR */
IOD_SINGLE_META_SEGMENT = 1U << 5,
};
@@ -737,9 +731,8 @@ static void nvme_unmap_metadata(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
- iod->meta_total_len,
- iod->flags & IOD_META_P2P_BUS_ADDR)) {
+ if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+ iod->meta_total_len)) {
if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
nvme_free_sgls(req, sge, &sge[1]);
else
@@ -766,8 +759,7 @@ static void nvme_unmap_data(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
- iod->flags & IOD_P2P_BUS_ADDR)) {
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
if (nvme_pci_cmd_use_sgl(&iod->cmd))
nvme_free_sgls(req, iod->descriptors[0],
&iod->cmd.common.dptr.sgl);
@@ -1043,9 +1035,6 @@ static blk_status_t nvme_map_data(struct request *req)
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
- if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
- iod->flags |= IOD_P2P_BUS_ADDR;
-
if (use_sgl == SGL_FORCED ||
(use_sgl == SGL_SUPPORTED &&
(sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
@@ -1068,9 +1057,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
&iod->meta_dma_state, &iter))
return iter.status;
- if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
- iod->flags |= IOD_META_P2P_BUS_ADDR;
- else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+ if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
entries = 1;
/*
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 506a947d00a5..582ac7e61b24 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3317,11 +3317,11 @@ static void dasd_release(struct gendisk *disk)
/*
* Return disk geometry.
*/
-static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
struct dasd_device *base;
- base = dasd_device_from_gendisk(bdev->bd_disk);
+ base = dasd_device_from_gendisk(disk);
if (!base)
return -ENODEV;
@@ -3331,7 +3331,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return -EINVAL;
}
base->discipline->fill_geometry(base->block, geo);
- geo->start = get_start_sect(bdev) >> base->block->s2b_shift;
+ // geo->start is left unchanged by the above
+ geo->start >>= base->block->s2b_shift;
dasd_put_device(base);
return 0;
}
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 883d4a12a172..a377a6f6900a 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1695,7 +1695,7 @@ out:
} /* End twa_reset_sequence() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[])
+static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c
index 8d4174c7107e..e319be7d369c 100644
--- a/drivers/scsi/3w-sas.c
+++ b/drivers/scsi/3w-sas.c
@@ -1404,7 +1404,7 @@ out:
} /* End twl_reset_device_extension() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[])
+static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[])
{
int heads, sectors;
diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c
index 89bd56f78ef9..0306a228c702 100644
--- a/drivers/scsi/3w-xxxx.c
+++ b/drivers/scsi/3w-xxxx.c
@@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev)
} /* End tw_reset_device_extension() */
/* This function returns unit geometry in cylinders/heads/sectors */
-static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index 1f100270cd38..8b17d8103e90 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -3240,7 +3240,7 @@ static int blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset)
the BIOS, and a warning may be displayed.
*/
-static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev,
+static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *params)
{
struct blogic_adapter *adapter =
@@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev,
diskparam->sectors = 32;
}
diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors);
- buf = scsi_bios_ptable(dev);
+ buf = scsi_bios_ptable(disk);
if (buf == NULL)
return 0;
/*
diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h
index 61bf26d4fc10..79de815e33b0 100644
--- a/drivers/scsi/BusLogic.h
+++ b/drivers/scsi/BusLogic.h
@@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets,
static const char *blogic_drvr_info(struct Scsi_Host *);
static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *);
-static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *);
+static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *);
static int blogic_sdev_configure(struct scsi_device *,
struct queue_limits *lim);
static void blogic_qcompleted_ccb(struct blogic_ccb *);
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 4b12e6dd8f07..ea66196ef7c7 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype)
/**
* aac_biosparm - return BIOS parameters for disk
* @sdev: The scsi device corresponding to the disk
- * @bdev: the block device corresponding to the disk
+ * @disk: the gendisk corresponding to the disk
* @capacity: the sector capacity of the disk
* @geom: geometry block to fill in
*
@@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype)
* be displayed.
*/
-static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev,
+static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *geom)
{
struct diskparm *param = (struct diskparm *)geom;
@@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev,
* entry whose end_head matches one of the standard geometry
* translations ( 64/32, 128/32, 255/63 ).
*/
- buf = scsi_bios_ptable(bdev);
+ buf = scsi_bios_ptable(disk);
if (!buf)
return 0;
if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) {
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 3a2c336307c0..063e1b5818d3 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp)
* ip[2]: cylinders
*/
static int
-advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
struct asc_board *boardp = shost_priv(sdev->host);
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index e94c0a19c435..182aa80ec4c6 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt)
* Return the "logical geometry"
*
*/
-static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int *info_array)
{
struct Scsi_Host *shpnt = sdev->host;
@@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev
int info[3];
/* try to figure out the geometry from the partition table */
- if (scsicam_bios_param(bdev, capacity, info) < 0 ||
+ if (scsicam_bios_param(disk, capacity, info) < 0 ||
!((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) {
if (EXT_TRANS) {
printk(KERN_NOTICE
diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c
index 389499d3e00a..371e8300f029 100644
--- a/drivers/scsi/aha1542.c
+++ b/drivers/scsi/aha1542.c
@@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd)
}
static int aha1542_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int geom[])
+ struct gendisk *unused, sector_t capacity, int geom[])
{
struct aha1542_hostdata *aha1542 = shost_priv(sdev->host);
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index be7ebbbb9ba8..b234621f6b37 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level,
}
static int aha1740_biosparam(struct scsi_device *sdev,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int* ip)
{
int size = capacity;
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 17dfc3c72110..c3d1b9dd24ae 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* Return the disk geometry for the given SCSI device.
*/
static int
-ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
int heads;
@@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
ahd = *((struct ahd_softc **)sdev->host->hostdata);
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
heads = 64;
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index cebf8c5d0caf..8b2b98666d61 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* Return the disk geometry for the given SCSI device.
*/
static int
-ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
int heads;
@@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev,
ahc = *((struct ahc_softc **)sdev->host->hostdata);
channel = sdev_channel(sdev);
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
heads = 64;
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index fb57343a97bd..f0c5a30ce51b 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb);
static int arcmsr_abort(struct scsi_cmnd *);
static int arcmsr_bus_reset(struct scsi_cmnd *);
static int arcmsr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int *info);
+ struct gendisk *disk, sector_t capacity, int *info);
static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd);
static int arcmsr_probe(struct pci_dev *pdev,
const struct pci_device_id *id);
@@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id)
}
static int arcmsr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int *geom)
+ struct gendisk *disk, sector_t capacity, int *geom)
{
int heads, sectors, cylinders, total_capacity;
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
total_capacity = capacity;
diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c
index 401242912855..df6f40b51deb 100644
--- a/drivers/scsi/atp870u.c
+++ b/drivers/scsi/atp870u.c
@@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr)
}
-static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev,
+static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused,
sector_t capacity, int *ip)
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c
index 504c4e0c5d17..c0b2a980db34 100644
--- a/drivers/scsi/fdomain.c
+++ b/drivers/scsi/fdomain.c
@@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd)
}
static int fdomain_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity,
+ struct gendisk *disk, sector_t capacity,
int geom[])
{
- unsigned char *p = scsi_bios_ptable(bdev);
+ unsigned char *p = scsi_bios_ptable(disk);
if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */
&& p[4]) { /* Partition type */
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 0821cf994b98..5c602c057798 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand)
* be done in sd.c. Even if it gets fixed there, this will still
* work.
*/
-static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
ip[0] = 0x40;
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index 8648bd965287..ed34ad92c807 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd)
/**
* i91u_biosparam - return the "logical geometry"
* @sdev: SCSI device
- * @dev: Matching block device
+ * @unused: Matching gendisk
* @capacity: Sector size of drive
* @info_array: Return space for BIOS geometry
*
@@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd)
* FIXME: limited to 2^32 sector devices.
*/
-static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int *info_array)
{
struct initio_host *host; /* Point to Host adapter control block */
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d06b79f03538..dd6754db7e4c 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev);
/**
* ipr_biosparam - Return the HSC mapping
- * @sdev: scsi device struct
- * @block_device: block device pointer
+ * @sdev: scsi device struct
+ * @unused: gendisk pointer
* @capacity: capacity of the device
- * @parm: Array containing returned HSC values.
+ * @parm: Array containing returned HSC values.
*
* This function generates the HSC parms that fdisk uses.
* We want to make sure we return something that places partitions
@@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev);
* 0 on success
**/
static int ipr_biosparam(struct scsi_device *sdev,
- struct block_device *block_device,
+ struct gendisk *unused,
sector_t capacity, int *parm)
{
int heads, sectors;
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 94adb6ac02a4..3393a288fd23 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue)
/* Set bios geometry for the controller */
/* */
/****************************************************************************/
-static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata;
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index 8ac932ec4444..30a4d4a580e9 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -398,7 +398,7 @@
/*
* Scsi_Host Template
*/
- static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+ static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[]);
static int ips_sdev_configure(struct scsi_device *SDptr,
struct queue_limits *lim);
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 928723c90b75..ffa5b49aaf08 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth)
EXPORT_SYMBOL_GPL(sas_change_queue_depth);
int sas_bios_param(struct scsi_device *scsi_dev,
- struct block_device *bdev,
+ struct gendisk *unused,
sector_t capacity, int *hsc)
{
hsc[0] = 255;
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 2006094af418..a00622c0c526 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare
* Return the disk geometry for a particular disk
*/
static int
-megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk,
sector_t capacity, int geom[])
{
adapter_t *adapter;
@@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev,
geom[2] = cylinders;
}
else {
- if (scsi_partsize(bdev, capacity, geom))
+ if (scsi_partsize(disk, capacity, geom))
return 0;
dev_info(&adapter->dev->dev,
diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h
index 013fbfb911b9..d6bfd26a8843 100644
--- a/drivers/scsi/megaraid.h
+++ b/drivers/scsi/megaraid.h
@@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *);
static int megaraid_abort(struct scsi_cmnd *);
static int megaraid_reset(struct scsi_cmnd *);
static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int);
-static int megaraid_biosparam(struct scsi_device *, struct block_device *,
+static int megaraid_biosparam(struct scsi_device *, struct gendisk *,
sector_t, int []);
static int mega_build_sglist (adapter_t *adapter, scb_t *scb,
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 615e06fd4ee8..abbbc4b36cd1 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd)
/**
* megasas_bios_param - Returns disk geometry for a disk
* @sdev: device handle
- * @bdev: block device
+ * @unused: gendisk
* @capacity: drive capacity
* @geom: geometry parameters
*/
static int
-megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads;
diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c
index e467b56949e9..3df52a3b435b 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_os.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_os.c
@@ -4031,7 +4031,7 @@ out:
/**
* mpi3mr_bios_param - BIOS param callback
* @sdev: SCSI device reference
- * @bdev: Block device reference
+ * @unused: gendisk reference
* @capacity: Capacity in logical sectors
* @params: Parameter array
*
@@ -4040,7 +4040,7 @@ out:
* Return: 0 always
*/
static int mpi3mr_bios_param(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int params[])
+ struct gendisk *unused, sector_t capacity, int params[])
{
int heads;
int sectors;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 967af259118e..7092d0debef3 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
/**
* scsih_bios_param - fetch head, sector, cylinder info for a disk
* @sdev: scsi device struct
- * @bdev: pointer to block device context
+ * @unused: pointer to gendisk
* @capacity: device size (in 512 byte sectors)
* @params: three element array to place output:
* params[0] number of heads (max 255)
@@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim)
* params[2] number of cylinders
*/
static int
-scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int params[])
{
int heads;
diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c
index 96549e7f5705..bdc2f2f17753 100644
--- a/drivers/scsi/mvumi.c
+++ b/drivers/scsi/mvumi.c
@@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd)
}
static int
-mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev,
+mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors;
diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c
index 486db5b2f05d..b8453c0333dc 100644
--- a/drivers/scsi/myrb.c
+++ b/drivers/scsi/myrb.c
@@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev)
kfree(sdev->hostdata);
}
-static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
struct myrb_hba *cb = shost_priv(sdev->host);
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 278c78d066c4..a3b505240351 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt)
static int
SYM53C500_biosparm(struct scsi_device *disk,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int *info_array)
{
int size;
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index 1ed3171f1797..ea682f3044b6 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -845,7 +845,7 @@ static DEF_SCSI_QCMD(ppa_queuecommand)
* be done in sd.c. Even if it gets fixed there, this will still
* work.
*/
-static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev,
+static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int ip[])
{
ip[0] = 0x40;
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 6af018f1ca22..ef841f643171 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd)
}
static int
-qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
int heads, sectors, cylinders;
diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c
index 3e065d5fc80c..1ce469b7db99 100644
--- a/drivers/scsi/qlogicfas408.c
+++ b/drivers/scsi/qlogicfas408.c
@@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand)
* Return bios parameters
*/
-int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev,
+int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused,
sector_t capacity, int ip[])
{
/* This should mimic the DOS Qlogic driver's behavior exactly */
diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h
index a971db11d293..83ef86c71f2f 100644
--- a/drivers/scsi/qlogicfas408.h
+++ b/drivers/scsi/qlogicfas408.h
@@ -106,7 +106,7 @@ struct qlogicfas408_priv {
irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id);
int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd);
int qlogicfas408_biosparam(struct scsi_device * disk,
- struct block_device *dev,
+ struct gendisk *unused,
sector_t capacity, int ip[]);
int qlogicfas408_abort(struct scsi_cmnd * cmd);
extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd);
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index 19e6c3852d50..887de505bcf9 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -30,9 +30,9 @@
* starting at offset %0x1be.
* Returns: partition table in kmalloc(GFP_KERNEL) memory, or NULL on error.
*/
-unsigned char *scsi_bios_ptable(struct block_device *dev)
+unsigned char *scsi_bios_ptable(struct gendisk *dev)
{
- struct address_space *mapping = bdev_whole(dev)->bd_mapping;
+ struct address_space *mapping = dev->part0->bd_mapping;
unsigned char *res = NULL;
struct folio *folio;
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(scsi_bios_ptable);
/**
* scsi_partsize - Parse cylinders/heads/sectors from PC partition table
- * @bdev: block device to parse
+ * @disk: gendisk of the disk to parse
* @capacity: size of the disk in sectors
* @geom: output in form of [hds, cylinders, sectors]
*
@@ -57,7 +57,7 @@ EXPORT_SYMBOL(scsi_bios_ptable);
*
* Returns: %false on failure, %true on success.
*/
-bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3])
+bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3])
{
int cyl, ext_cyl, end_head, end_cyl, end_sector;
unsigned int logical_end, physical_end, ext_physical_end;
@@ -65,7 +65,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3])
void *buf;
int ret = false;
- buf = scsi_bios_ptable(bdev);
+ buf = scsi_bios_ptable(disk);
if (!buf)
return false;
@@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds
/**
* scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors.
- * @bdev: which device
+ * @disk: which device
* @capacity: size of the disk in sectors
* @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders
*
@@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds
*
* Returns : -1 on failure, 0 on success.
*/
-int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip)
+int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip)
{
u64 capacity64 = capacity; /* Suppress gcc warning */
int ret = 0;
/* try to infer mapping from partition table */
- if (scsi_partsize(bdev, capacity, ip))
+ if (scsi_partsize(disk, capacity, ip))
return 0;
if (capacity64 < (1ULL << 32)) {
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5b8668accf8e..00ad574ce61c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk)
scsi_device_put(sdev);
}
-static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
- struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
+ struct scsi_disk *sdkp = scsi_disk(disk);
struct scsi_device *sdp = sdkp->device;
struct Scsi_Host *host = sdp->host;
sector_t capacity = logical_to_sectors(sdp, sdkp->capacity);
@@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
/* override with calculated, extended default, or driver values */
if (host->hostt->bios_param)
- host->hostt->bios_param(sdp, bdev, capacity, diskinfo);
+ host->hostt->bios_param(sdp, disk, capacity, diskinfo);
else
- scsicam_bios_param(bdev, capacity, diskinfo);
+ scsicam_bios_param(disk, capacity, diskinfo);
geo->heads = diskinfo[0];
geo->sectors = diskinfo[1];
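sd_getgeo() above shows the dispatch: host templates with a bios_param callback now receive the gendisk, and the scsicam fallback takes it as well. A minimal sketch of a host-side bios_param under the new signature, assuming the common pattern visible in the megaraid and aic7xxx hunks (prefer the on-disk partition table via scsi_partsize(), otherwise fall back to a fixed 64/32 translation):

static int demo_bios_param(struct scsi_device *sdev, struct gendisk *disk,
			   sector_t capacity, int geom[])
{
	if (scsi_partsize(disk, capacity, geom))
		return 0;

	geom[0] = 64;					/* heads */
	geom[1] = 32;					/* sectors per track */
	geom[2] = (unsigned long)capacity / (64 * 32);	/* cylinders */
	return 0;
}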
diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c
index 63ed7f9aaa93..d8ad02c29320 100644
--- a/drivers/scsi/stex.c
+++ b/drivers/scsi/stex.c
@@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work)
}
static int stex_biosparam(struct scsi_device *sdev,
- struct block_device *bdev, sector_t capacity, int geom[])
+ struct gendisk *unused, sector_t capacity, int geom[])
{
int heads = 255, sectors = 63;
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index d9e59204a9c3..dc51ea352198 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice,
return 0;
}
-static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev,
+static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int *info)
{
sector_t nsect = capacity;
diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c
index 5a380eecfc75..0c9987828774 100644
--- a/drivers/scsi/wd719x.c
+++ b/drivers/scsi/wd719x.c
@@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd)
return wd719x_chip_init(wd) == 0 ? SUCCESS : FAILED;
}
-static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev,
+static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused,
sector_t capacity, int geom[])
{
if (capacity >= 0x200000) {
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index f991cf759836..db4e09042469 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -861,7 +861,7 @@ new_bio:
bio = bio_kmalloc(nr_vecs, GFP_KERNEL);
if (!bio)
goto fail;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs,
+ bio_init_inline(bio, NULL, nr_vecs,
rw ? REQ_OP_WRITE : REQ_OP_READ);
bio->bi_end_io = pscsi_bi_endio;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 590cd29f3e86..e701e6d6e321 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -2084,7 +2084,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
INIT_WORK(&scrub->work, btree_node_scrub_work);
- bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+ bio_init_inline(&scrub->bio, ca->disk_sb.bdev, vecs, REQ_OP_READ);
bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
scrub->bio.bi_end_io = btree_node_scrub_endio;
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 5e14d13568de..1b37780abfda 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -62,7 +62,6 @@ struct promote_op {
struct work_struct work;
struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
};
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ddfeb0dafc9d..474e0e867b68 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1627,14 +1627,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
- nr_bvecs), GFP_KERNEL);
+ ja->bio[i] = kzalloc(sizeof(*ja->bio[i]) +
+ sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL);
if (!ja->bio[i])
return bch_err_throw(c, ENOMEM_dev_journal_init);
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
- bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ bio_init_inline(&ja->bio[i]->bio, NULL, nr_bvecs, 0);
}
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 9e028dbcc3d0..ce8888d518c3 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1071,7 +1071,7 @@ reread:
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
return bch_err_throw(c, ENOMEM_journal_read_bucket);
- bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+ bio_init_inline(bio, ca->disk_sb.bdev, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6c2e1d647403..e3fcffb93d2b 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -232,7 +232,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
if (!bio)
return -BCH_ERR_ENOMEM_sb_bio_realloc;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+ bio_init_inline(bio, NULL, nr_bvecs, 0);
kfree(sb->bio);
sb->bio = bio;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b84f6af2eb4c..c06e41fd4d0a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -337,8 +337,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
u64 copied = 0;
size_t orig_count;
- if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
- !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
+ if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1))
return -EINVAL;
if (dio->flags & IOMAP_DIO_WRITE) {
@@ -434,7 +433,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
+ ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index b69c294e3ef0..a05e3793f93a 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -231,7 +231,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
bio = bio_kmalloc(page_count, GFP_NOIO);
if (!bio)
return -ENOMEM;
- bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ);
+ bio_init_inline(bio, sb->s_bdev, page_count, REQ_OP_READ);
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index 0a25716820fe..851254f36eb3 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -13,6 +13,7 @@ enum bip_flags {
BIP_CHECK_GUARD = 1 << 5, /* guard check */
BIP_CHECK_REFTAG = 1 << 6, /* reftag check */
BIP_CHECK_APPTAG = 1 << 7, /* apptag check */
+ BIP_P2P_DMA = 1 << 8, /* using P2P address */
};
struct bio_integrity_payload {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 46ffac5caab7..a64a30131031 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
-int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, unsigned max_bytes);
+int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes, unsigned len_align);
/**
* bio_next_split - get next @sectors from a bio, splitting if necessary
@@ -405,6 +405,11 @@ struct request_queue;
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
unsigned short max_vecs, blk_opf_t opf);
+static inline void bio_init_inline(struct bio *bio, struct block_device *bdev,
+ unsigned short max_vecs, blk_opf_t opf)
+{
+ bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf);
+}
extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);
@@ -441,7 +446,14 @@ int submit_bio_wait(struct bio *bio);
int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
size_t len, enum req_op op);
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+ unsigned len_align_mask);
+
+static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+ return bio_iov_iter_get_pages_aligned(bio, iter, 0);
+}
+
void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
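bio_init_inline() above pairs with the bio_inline_vecs() accessor added in blk_types.h further down: callers that allocate a bio with trailing bio_vecs no longer name a bi_inline_vecs member. A short sketch of the allocation pattern used by the bio_kmalloc() callers converted in this diff (squashfs, bcachefs), assuming bio_kmalloc() keeps placing the vector array directly behind the struct bio:

static struct bio *demo_alloc_read_bio(struct block_device *bdev,
				       unsigned short nr_vecs)
{
	struct bio *bio = bio_kmalloc(nr_vecs, GFP_KERNEL);

	if (!bio)
		return NULL;
	/* point bi_io_vec at the trailing inline vectors and set up the bio */
	bio_init_inline(bio, bdev, nr_vecs, REQ_OP_READ);
	return bio;
}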
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index 78fe2459e661..b659373788f6 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -27,6 +27,15 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
#ifdef CONFIG_BLK_DEV_INTEGRITY
int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
+
+static inline bool blk_rq_integrity_dma_unmap(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ size_t mapped_len)
+{
+ return blk_dma_unmap(req, dma_dev, state, mapped_len,
+ bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA);
+}
+
int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes);
@@ -115,6 +124,12 @@ static inline int blk_rq_map_integrity_sg(struct request *q,
{
return 0;
}
+static inline bool blk_rq_integrity_dma_unmap(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ size_t mapped_len)
+{
+ return false;
+}
static inline int blk_rq_integrity_map_user(struct request *rq,
void __user *ubuf,
ssize_t bytes)
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 0f45ea110ca1..51829958d872 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -43,7 +43,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
}
/**
- * blk_rq_dma_unmap - try to DMA unmap a request
+ * blk_dma_unmap - try to DMA unmap a request
* @req: request to unmap
* @dma_dev: device to unmap from
* @state: DMA IOVA state
@@ -53,7 +53,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
* Returns %false if the callers need to manually unmap every DMA segment
* mapped using @iter or %true if no work is left to be done.
*/
-static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
{
if (is_p2p)
@@ -68,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
return !dma_need_unmap(dma_dev);
}
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, size_t mapped_len)
+{
+ return blk_dma_unmap(req, dma_dev, state, mapped_len,
+ req->cmd_flags & REQ_P2PDMA);
+}
+
#endif /* BLK_MQ_DMA_H */
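With the rename above, blk_dma_unmap() keeps the explicit is_p2p argument, while the new blk_rq_dma_unmap() wrapper derives it from REQ_P2PDMA in req->cmd_flags, matching the nvme/pci.c hunks that drop the driver-tracked IOD_P2P_BUS_ADDR flags. A caller-side sketch, assuming the documented return convention (true means nothing is left to unmap):

static void demo_unmap_data(struct request *req, struct device *dma_dev,
			    struct dma_iova_state *state, size_t mapped_len)
{
	if (blk_rq_dma_unmap(req, dma_dev, state, mapped_len))
		return;		/* fully handled, e.g. IOVA or P2P bus address */

	/* otherwise walk the request and unmap each DMA segment manually */
}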
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a5a828f19a0..b25d12545f46 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -507,6 +507,8 @@ enum hctx_type {
* request_queue.tag_set_list.
* @srcu: Use as lock when type of the request queue is blocking
* (BLK_MQ_F_BLOCKING).
+ * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent
+ * use-after-free when iterating tags.
* @update_nr_hwq_lock:
* Synchronize updating nr_hw_queues with add/del disk &
* switching elevator.
@@ -531,6 +533,7 @@ struct blk_mq_tag_set {
struct mutex tag_list_lock;
struct list_head tag_list;
struct srcu_struct *srcu;
+ struct srcu_struct tags_srcu;
struct rw_semaphore update_nr_hwq_lock;
};
@@ -767,6 +770,7 @@ struct blk_mq_tags {
* request pool
*/
spinlock_t lock;
+ struct rcu_head rcu_head;
};
static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 930daff207df..8e8d1cc8b06c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error)
return true;
}
-struct bio_issue {
- u64 value;
-};
-
typedef __u32 __bitwise blk_opf_t;
typedef unsigned int blk_qc_t;
@@ -242,7 +238,8 @@ struct bio {
* on release of the bio.
*/
struct blkcg_gq *bi_blkg;
- struct bio_issue bi_issue;
+ /* Time that this bio was issued. */
+ u64 issue_time_ns;
#ifdef CONFIG_BLK_CGROUP_IOCOST
u64 bi_iocost_cost;
#endif
@@ -269,18 +266,16 @@ struct bio {
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
-
- /*
- * We can inline a number of vecs at the end of the bio, to avoid
- * double allocations for a small number of bio_vecs. This member
- * MUST obviously be kept at the very end of the bio.
- */
- struct bio_vec bi_inline_vecs[];
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
#define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT)
+static inline struct bio_vec *bio_inline_vecs(struct bio *bio)
+{
+ return (struct bio_vec *)(bio + 1);
+}
+
/*
* bio flags
*/
@@ -386,6 +381,7 @@ enum req_flag_bits {
__REQ_DRV, /* for driver use */
__REQ_FS_PRIVATE, /* for file system (submitter) use */
__REQ_ATOMIC, /* for atomic write operations */
+ __REQ_P2PDMA, /* contains P2P DMA pages */
/*
* Command specific flags, keep last:
*/
@@ -418,6 +414,7 @@ enum req_flag_bits {
#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV)
#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC)
+#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA)
#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe1797bbec42..066e5309bd45 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -657,6 +657,7 @@ enum {
QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */
QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */
QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */
+ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */
QUEUE_FLAG_MAX
};
@@ -999,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
void submit_bio_noacct(struct bio *bio);
struct bio *bio_split_to_limits(struct bio *bio);
+struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
+ struct bio_set *bs);
extern int blk_lld_busy(struct request_queue *q);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
@@ -1590,13 +1593,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
return queue_dma_alignment(bdev_get_queue(bdev));
}
-static inline bool bdev_iter_is_aligned(struct block_device *bdev,
- struct iov_iter *iter)
-{
- return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev),
- bdev_logical_block_size(bdev) - 1);
-}
-
static inline unsigned int
blk_lim_dma_alignment_and_pad(struct queue_limits *lim)
{
@@ -1660,7 +1656,7 @@ struct block_device_operations {
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
void (*unlock_native_capacity) (struct gendisk *);
- int (*getgeo)(struct block_device *, struct hd_geometry *);
+ int (*getgeo)(struct gendisk *, struct hd_geometry *);
int (*set_read_only)(struct block_device *bdev, bool ro);
void (*free_disk)(struct gendisk *disk);
/* this callback is with swap_lock and sometimes page table lock held */
@@ -1870,6 +1866,20 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
}
+static inline int bio_split_rw_at(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes)
+{
+ return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment);
+}
+
+static inline int bio_iov_iter_get_bdev_pages(struct bio *bio,
+ struct iov_iter *iter, struct block_device *bdev)
+{
+ return bio_iov_iter_get_pages_aligned(bio, iter,
+ bdev_logical_block_size(bdev) - 1);
+}
+
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
#endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0620dd67369f..21de0935775d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc);
extern u64 ata_qc_get_active(struct ata_port *ap);
extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd);
extern int ata_std_bios_param(struct scsi_device *sdev,
- struct block_device *bdev,
+ struct gendisk *unused,
sector_t capacity, int geom[]);
extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
extern int ata_scsi_sdev_init(struct scsi_device *sdev);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2e86c653186c..5b127043a151 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -286,8 +286,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#endif
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
-bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index ba460b6c0374..9c6e90829dbd 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *);
extern int sas_target_alloc(struct scsi_target *);
int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim);
extern int sas_change_queue_depth(struct scsi_device *, int new_depth);
-extern int sas_bios_param(struct scsi_device *, struct block_device *,
+extern int sas_bios_param(struct scsi_device *, struct gendisk *,
sector_t capacity, int *hsc);
int sas_execute_internal_abort_single(struct domain_device *device,
u16 tag, unsigned int qid,
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index c53812b9026f..f5a243261236 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -318,7 +318,7 @@ struct scsi_host_template {
*
* Status: OPTIONAL
*/
- int (* bios_param)(struct scsi_device *, struct block_device *,
+ int (* bios_param)(struct scsi_device *, struct gendisk *,
sector_t, int []);
/*
diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h
index 08edd603e521..1131f51ed2c8 100644
--- a/include/scsi/scsicam.h
+++ b/include/scsi/scsicam.h
@@ -13,7 +13,8 @@
#ifndef SCSICAM_H
#define SCSICAM_H
-int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip);
-bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]);
-unsigned char *scsi_bios_ptable(struct block_device *bdev);
+struct gendisk;
+int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip);
+bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]);
+unsigned char *scsi_bios_ptable(struct gendisk *disk);
#endif /* def SCSICAM_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f9193f952f49..2fe66a6b8789 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -784,101 +784,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
}
EXPORT_SYMBOL(iov_iter_discard);
-static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- const struct iovec *iov = iter_iov(i);
- size_t size = i->count;
- size_t skip = i->iov_offset;
-
- do {
- size_t len = iov->iov_len - skip;
-
- if (len > size)
- len = size;
- if (len & len_mask)
- return false;
- if ((unsigned long)(iov->iov_base + skip) & addr_mask)
- return false;
-
- iov++;
- size -= len;
- skip = 0;
- } while (size);
-
- return true;
-}
-
-static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- const struct bio_vec *bvec = i->bvec;
- unsigned skip = i->iov_offset;
- size_t size = i->count;
-
- do {
- size_t len = bvec->bv_len - skip;
-
- if (len > size)
- len = size;
- if (len & len_mask)
- return false;
- if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
- return false;
-
- bvec++;
- size -= len;
- skip = 0;
- } while (size);
-
- return true;
-}
-
-/**
- * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
- * are aligned to the parameters.
- *
- * @i: &struct iov_iter to restore
- * @addr_mask: bit mask to check against the iov element's addresses
- * @len_mask: bit mask to check against the iov element's lengths
- *
- * Return: false if any addresses or lengths intersect with the provided masks
- */
-bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- if (likely(iter_is_ubuf(i))) {
- if (i->count & len_mask)
- return false;
- if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
- return false;
- return true;
- }
-
- if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
- return iov_iter_aligned_iovec(i, addr_mask, len_mask);
-
- if (iov_iter_is_bvec(i))
- return iov_iter_aligned_bvec(i, addr_mask, len_mask);
-
- /* With both xarray and folioq types, we're dealing with whole folios. */
- if (iov_iter_is_xarray(i)) {
- if (i->count & len_mask)
- return false;
- if ((i->xarray_start + i->iov_offset) & addr_mask)
- return false;
- }
- if (iov_iter_is_folioq(i)) {
- if (i->count & len_mask)
- return false;
- if (i->iov_offset & addr_mask)
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
-
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
const struct iovec *iov = iter_iov(i);
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 5d7f4ecfb816..770269efe42a 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -20,6 +20,7 @@ TEST_PROGS += test_generic_09.sh
TEST_PROGS += test_generic_10.sh
TEST_PROGS += test_generic_11.sh
TEST_PROGS += test_generic_12.sh
+TEST_PROGS += test_generic_13.sh
TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 6512dfbdbce3..b636d40b4889 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1363,21 +1363,23 @@ static int cmd_dev_list(struct dev_ctx *ctx)
static int cmd_dev_get_features(void)
{
#define const_ilog2(x) (63 - __builtin_clzll(x))
+#define FEAT_NAME(f) [const_ilog2(f)] = #f
static const char *feat_map[] = {
- [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY",
- [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK",
- [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA",
- [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY",
- [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE",
- [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV",
- [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE",
- [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY",
- [const_ilog2(UBLK_F_ZONED)] = "ZONED",
- [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO",
- [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
- [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
- [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
- [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
+ FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
+ FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
+ FEAT_NAME(UBLK_F_NEED_GET_DATA),
+ FEAT_NAME(UBLK_F_USER_RECOVERY),
+ FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
+ FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
+ FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
+ FEAT_NAME(UBLK_F_USER_COPY),
+ FEAT_NAME(UBLK_F_ZONED),
+ FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
+ FEAT_NAME(UBLK_F_UPDATE_SIZE),
+ FEAT_NAME(UBLK_F_AUTO_BUF_REG),
+ FEAT_NAME(UBLK_F_QUIESCE),
+ FEAT_NAME(UBLK_F_PER_IO_DAEMON),
+ FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
};
struct ublk_dev *dev;
__u64 features = 0;
@@ -1404,7 +1406,7 @@ static int cmd_dev_get_features(void)
feat = feat_map[i];
else
feat = "unknown";
- printf("\t%-20s: 0x%llx\n", feat, 1ULL << i);
+ printf("0x%-16llx: %s\n", 1ULL << i, feat);
}
}
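
A minimal, self-contained userspace sketch of the FEAT_NAME technique used above (the flag names here are made up): const_ilog2 turns a single-bit flag into its bit position, which indexes a designated initializer, and the # operator stringifies the flag macro, so adding a new flag only takes one table line.

#include <stdio.h>

#define FLAG_FOO	(1ULL << 0)
#define FLAG_BAR	(1ULL << 5)

#define const_ilog2(x)	(63 - __builtin_clzll(x))
#define FEAT_NAME(f)	[const_ilog2(f)] = #f

static const char *name_map[] = {
	FEAT_NAME(FLAG_FOO),
	FEAT_NAME(FLAG_BAR),
};

int main(void)
{
	/* Prints "FLAG_BAR": bit 5 maps back to the stringified macro name. */
	printf("%s\n", name_map[const_ilog2(FLAG_BAR)]);
	return 0;
}
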
diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh
index 9227a208ba53..21a31cd5491a 100755
--- a/tools/testing/selftests/ublk/test_generic_01.sh
+++ b/tools/testing/selftests/ublk/test_generic_01.sh
@@ -10,6 +10,10 @@ if ! _have_program bpftrace; then
exit "$UBLK_SKIP_CODE"
fi
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
_prep_test "null" "sequential io order"
dev_id=$(_add_ublk_dev -t null)
diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh
index 3e80121e3bf5..12920768b1a0 100755
--- a/tools/testing/selftests/ublk/test_generic_02.sh
+++ b/tools/testing/selftests/ublk/test_generic_02.sh
@@ -10,6 +10,10 @@ if ! _have_program bpftrace; then
exit "$UBLK_SKIP_CODE"
fi
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
_prep_test "null" "sequential io order for MQ"
dev_id=$(_add_ublk_dev -t null -q 2)
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
index 7abbb00d251d..b4046201b4d9 100755
--- a/tools/testing/selftests/ublk/test_generic_12.sh
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -10,6 +10,10 @@ if ! _have_program bpftrace; then
exit "$UBLK_SKIP_CODE"
fi
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
_prep_test "null" "do imbalanced load, it should be balanced over I/O threads"
NTHREADS=6
diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh
new file mode 100755
index 000000000000..b7aa90b1cb74
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_13.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_13"
+ERR_CODE=0
+
+_prep_test "null" "check that feature list is complete"
+
+if ${UBLK_PROG} features | grep -q unknown; then
+ echo "# unknown feature detected!"
+ echo "# did you add a feature and forget to update feat_map in kublk?"
+ echo "# this failure is expected if running an older test suite against"
+ echo "# a newer kernel with new features added"
+ ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
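
A hedged example of what the new test greps for (invocation, bit values, and spacing are illustrative only): with the reordered printf above, the feature listing prints the bit value first and the name last, so a bit missing from feat_map shows up as a line ending in "unknown", which the grep -q in test_generic_13.sh catches:

$ ./kublk features
0x1             : UBLK_F_SUPPORT_ZERO_COPY
0x40            : UBLK_F_CMD_IOCTL_ENCODE
0x8000          : unknown
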
diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh
index a34203f72668..c2cb8f7a09fe 100755
--- a/tools/testing/selftests/ublk/test_null_01.sh
+++ b/tools/testing/selftests/ublk/test_null_01.sh
@@ -6,6 +6,10 @@
TID="null_01"
ERR_CODE=0
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
_prep_test "null" "basic IO test"
dev_id=$(_add_ublk_dev -t null)
diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh
index 5633ca876655..8accd35beb55 100755
--- a/tools/testing/selftests/ublk/test_null_02.sh
+++ b/tools/testing/selftests/ublk/test_null_02.sh
@@ -6,6 +6,10 @@
TID="null_02"
ERR_CODE=0
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
_prep_test "null" "basic IO test with zero copy"
dev_id=$(_add_ublk_dev -t null -z)
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index 566cfd90d192..274295061042 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -5,6 +5,10 @@
TID="stress_05"
ERR_CODE=0
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
run_io_and_remove()
{
local size=$1