From 0e5e4f0e56aca0df1d5648db0be9028bd573b25c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Nov 2012 16:33:05 -0700 Subject: NVMe: Add discard support for capable devices This adds discard support to block queues if the nvme device is capable of deallocating blocks as indicated by the controller's optional command support. A discard flagged bio request will submit an NVMe deallocate Data Set Management command for the requested blocks. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nvme.h | 32 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9dcefe40380..26e26607207 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -80,6 +80,7 @@ struct nvme_dev { char model[40]; char firmware_rev[8]; u32 max_hw_sectors; + u16 oncs; }; /* @@ -510,6 +511,44 @@ static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, return length; } +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio, struct nvme_iod *iod, int cmdid) +{ + struct nvme_dsm_range *range; + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return -ENOMEM; + + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); + range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.command_id = cmdid; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { @@ -567,6 +606,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (unlikely(cmdid < 0)) goto free_iod; + if (bio->bi_rw & REQ_DISCARD) { + result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if (result) + goto free_cmdid; + return result; + } if ((bio->bi_rw & REQ_FLUSH) && !psegs) return nvme_submit_flush(nvmeq, ns, cmdid); @@ -1347,6 +1392,16 @@ static void nvme_put_ns_idx(int index) spin_unlock(&dev_list_lock); } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { @@ -1366,7 +1421,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -/* 
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ blk_queue_make_request(ns->queue, nvme_make_request); ns->dev = dev; ns->queue->queuedata = ns; @@ -1392,6 +1446,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + return ns; out_free_queue: @@ -1520,6 +1577,7 @@ static int nvme_dev_add(struct nvme_dev *dev) ctrl = mem; nn = le32_to_cpup(&ctrl->nn); + dev->oncs = le16_to_cpup(&ctrl->oncs); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 4fa3b0b9b07..bde44c1fd21 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -107,6 +107,12 @@ struct nvme_id_ctrl { __u8 vs[1024]; }; +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, +}; + struct nvme_lbaf { __le16 ms; __u8 ds; @@ -246,6 +252,31 @@ enum { NVME_RW_DSM_COMPRESSED = 1 << 7, }; +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -372,6 +403,7 @@ struct nvme_command { struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; struct nvme_download_firmware dlfw; + struct nvme_dsm_cmd dsm; }; }; -- cgit v1.2.3 From 729dd1bd802acb973eec9c73ccb87d3143c13937 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Mar 2013 18:40:56 -0700 Subject: NVMe: Rename nvme.c to nvme-core.c In preparation for adding nvme-scsi.c It is preferable to retain the module name 'nvme' Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/Makefile | 1 + drivers/block/nvme-core.c | 1865 +++++++++++++++++++++++++++++++++++++++++++++ drivers/block/nvme.c | 1865 --------------------------------------------- 3 files changed, 1866 insertions(+), 1865 deletions(-) create mode 100644 drivers/block/nvme-core.c delete mode 100644 drivers/block/nvme.c diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a3b40232c6a..2a41c86d3ad 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,4 +42,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ +nvme-y := nvme-core.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c new file mode 100644 index 00000000000..26e26607207 --- /dev/null +++ b/drivers/block/nvme-core.c @@ -0,0 +1,1865 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NVME_Q_DEPTH 1024 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define NVME_MINORS 64 +#define NVME_IO_TIMEOUT (5 * HZ) +#define ADMIN_TIMEOUT (60 * HZ) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u32 max_hw_sectors; + u16 oncs; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + wait_queue_head_t sq_full; + wait_queue_t sq_cong_wait; + struct bio_list sq_cong; + u32 __iomem *q_db; + u16 q_depth; + u16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 cq_phase; + unsigned long cmdid_data[]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); +} + +typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, + struct nvme_completion *); + +struct nvme_cmd_info { + nvme_completion_fn fn; + void *ctx; + unsigned long timeout; +}; + +static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +{ + return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; +} + +/** + * alloc_cmdid() - Allocate a Command ID + * @nvmeq: The queue that will be used for this command + * @ctx: A pointer that will be passed to the handler + * @handler: The function to call on completion + * + * Allocate a Command ID for a queue. 
The data passed in will + * be passed to the completion handler. This is implemented by using + * the bottom two bits of the ctx pointer to store the handler ID. + * Passing in a pointer that's not 4-byte aligned will cause a BUG. + * We can change this if it becomes a problem. + * + * May be called with local interrupts disabled and the q_lock held, + * or with interrupts enabled and no locks held. + */ +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + int cmdid; + + do { + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); + if (cmdid >= depth) + return -EBUSY; + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + + info[cmdid].fn = handler; + info[cmdid].ctx = ctx; + info[cmdid].timeout = jiffies + timeout; + return cmdid; +} + +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int cmdid; + wait_event_killable(nvmeq->sq_full, + (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); + return (cmdid < 0) ? -EINTR : cmdid; +} + +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) +#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) + +static void special_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_FLUSH) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(&dev->pci_dev->dev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(&dev->pci_dev->dev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + + dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. 
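alloc_cmdid() above is effectively a lock-free slot allocator: find_first_zero_bit() nominates a free ID, test_and_set_bit() claims it atomically, and the loop retries if another CPU won the race; the CMD_CTX_* poison-pointer values then let stale or duplicated completions be recognised without dereferencing garbage. As a rough single-threaded sketch of the bitmap-plus-info-array shape (all names here are illustrative, and the atomics are elided):

    #define Q_DEPTH 32

    struct cmd_slot {
            void (*fn)(void *ctx);          /* completion handler */
            void *ctx;                      /* handed back on completion */
    };

    static unsigned long used_ids;          /* one bit per command ID */
    static struct cmd_slot slots[Q_DEPTH];

    static int alloc_cmdid_sketch(void *ctx, void (*fn)(void *))
    {
            int id;

            for (id = 0; id < Q_DEPTH; id++) {
                    if (!(used_ids & (1UL << id))) {
                            used_ids |= 1UL << id;  /* test_and_set_bit() in the driver */
                            slots[id].fn = fn;
                            slots[id].ctx = ctx;
                            return id;
                    }
            }
            return -1;                      /* the driver returns -EBUSY */
    }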
+ */ +static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + + if (cmdid >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_COMPLETED; + clear_bit(cmdid, nvmeq->cmdid_data); + wake_up(&nvmeq->sq_full); + return ctx; +} + +static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_CANCELLED; + return ctx; +} + +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +{ + return dev->queues[get_cpu() + 1]; +} + +static void put_nvmeq(struct nvme_queue *nvmeq) +{ + put_cpu(); +} + +/** + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + u16 tail; + spin_lock_irqsave(&nvmeq->q_lock, flags); + tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (++tail == nvmeq->q_depth) + tail = 0; + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; +} + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; + +static __le64 **iod_list(struct nvme_iod *iod) +{ + return ((void *)iod) + iod->offset; +} + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. 
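nvme_submit_cmd() above shows the producer half of an NVMe submission queue in its entirety: copy the 64-byte command into the ring at the current tail, advance the tail with wrap-around, then write the new tail to the queue's doorbell register so the controller knows to fetch the entry. A stripped-down sketch of that sequence, with an illustrative struct and the q_lock elided:

    struct sq_sketch {
            struct nvme_command *cmds;      /* DMA-coherent ring, SQ_SIZE(depth) bytes */
            u16 tail;
            u16 depth;
            u32 __iomem *doorbell;          /* SQ tail doorbell register in BAR0 */
    };

    static void sq_submit_sketch(struct sq_sketch *sq, struct nvme_command *cmd)
    {
            memcpy(&sq->cmds[sq->tail], cmd, sizeof(*cmd));
            if (++sq->tail == sq->depth)    /* one past the last slot wraps to 0 */
                    sq->tail = 0;
            writel(sq->tail, sq->doorbell); /* controller may fetch the command now */
    }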
+ */ +static int nvme_npages(unsigned size) +{ + unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static struct nvme_iod * +nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(struct scatterlist) * nseg, gfp); + + if (iod) { + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; + } + + return iod; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +{ + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + kfree(iod); +} + +static void requeue_bio(struct nvme_dev *dev, struct bio *bio) +{ + struct nvme_queue *nvmeq = get_nvmeq(dev); + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + put_nvmeq(nvmeq); + wake_up_process(nvme_thread); +} + +static void bio_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_iod *iod = ctx; + struct bio *bio = iod->private; + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (iod->nents) + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + nvme_free_iod(dev, iod); + if (status) { + bio_endio(bio, -EIO); + } else if (bio->bi_vcnt > bio->bi_idx) { + requeue_bio(dev, bio); + } else { + bio_endio(bio, 0); + } +} + +/* length is in bytes. gfp flags indicates whether we may sleep. 
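To make the overestimate concrete with 4 KiB pages: a 128 KiB transfer gives nprps = DIV_ROUND_UP(131072 + 4096, 4096) = 33, where the extra page term covers a first buffer that is not page aligned, and those 33 eight-byte entries fit in DIV_ROUND_UP(8 * 33, 4096 - 8) = 1 list page. The "- 8" reserves the final entry of every list page as a chain pointer to the next one, which is exactly what nvme_setup_prps() below stores via old_prp_list[i - 1] = cpu_to_le64(prp_dma).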
*/ +static int nvme_setup_prps(struct nvme_dev *dev, + struct nvme_common_command *cmd, struct nvme_iod *iod, + int total_len, gfp_t gfp) +{ + struct dma_pool *pool; + int length = total_len; + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = offset_in_page(dma_addr); + __le64 *prp_list; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma; + int nprps, i; + + cmd->prp1 = cpu_to_le64(dma_addr); + length -= (PAGE_SIZE - offset); + if (length <= 0) + return total_len; + + dma_len -= (PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= PAGE_SIZE) { + cmd->prp2 = cpu_to_le64(dma_addr); + return total_len; + } + + nprps = DIV_ROUND_UP(length, PAGE_SIZE); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + cmd->prp2 = cpu_to_le64(dma_addr); + iod->npages = -1; + return (total_len - length) + PAGE_SIZE; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + cmd->prp2 = cpu_to_le64(prp_dma); + i = 0; + for (;;) { + if (i == PAGE_SIZE / 8) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= PAGE_SIZE; + dma_addr += PAGE_SIZE; + length -= PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return total_len; +} + +/* NVMe scatterlists require no holes in the virtual address */ +#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + +static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, + struct bio *bio, enum dma_data_direction dma_dir, int psegs) +{ + struct bio_vec *bvec, *bvprv = NULL; + struct scatterlist *sg = NULL; + int i, old_idx, length = 0, nsegs = 0; + + sg_init_table(iod->sg, psegs); + old_idx = bio->bi_idx; + bio_for_each_segment(bvec, bio, i) { + if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + sg->length += bvec->bv_len; + } else { + if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + break; + sg = sg ? sg + 1 : iod->sg; + sg_set_page(sg, bvec->bv_page, bvec->bv_len, + bvec->bv_offset); + nsegs++; + } + length += bvec->bv_len; + bvprv = bvec; + } + bio->bi_idx = i; + iod->nents = nsegs; + sg_mark_end(sg); + if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { + bio->bi_idx = old_idx; + return -ENOMEM; + } + return length; +} + +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. 
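The nvme_submit_discard() path that follows packs a single struct nvme_dsm_range into DMA memory and points a Dataset Management command at it. Two encoding details are easy to miss: dsm.nr is a zero's-based range count (0 means one range), and both the length and the start must be converted from the block layer's 512-byte sectors into the namespace's LBA units. A small worked sketch of just that conversion, assuming an illustrative 4 KiB-LBA namespace (lba_shift = 12):

    /* bio covering 64 KiB at sector 2048, both in 512-byte units */
    unsigned int bi_size = 65536;
    sector_t bi_sector = 2048;
    int lba_shift = 12;

    u32 nlb  = bi_size >> lba_shift;            /* 65536 >> 12 = 16 LBAs */
    u64 slba = bi_sector >> (lba_shift - 9);    /* 2048 >> 3   = LBA 256 */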
+ */ +static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio, struct nvme_iod *iod, int cmdid) +{ + struct nvme_dsm_range *range; + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return -ENOMEM; + + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); + range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.command_id = cmdid; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, + special_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + return cmdid; + + return nvme_submit_flush(nvmeq, ns, cmdid); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_command *cmnd; + struct nvme_iod *iod; + enum dma_data_direction dma_dir; + int cmdid, length, result = -ENOMEM; + u16 control; + u32 dsmgmt; + int psegs = bio_phys_segments(ns->queue, bio); + + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + + iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + if (!iod) + goto nomem; + iod->private = bio; + + result = -EBUSY; + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + goto free_iod; + + if (bio->bi_rw & REQ_DISCARD) { + result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if (result) + goto free_cmdid; + return result; + } + if ((bio->bi_rw & REQ_FLUSH) && !psegs) + return nvme_submit_flush(nvmeq, ns, cmdid); + + control = 0; + if (bio->bi_rw & REQ_FUA) + control |= NVME_RW_FUA; + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + dsmgmt = 0; + if (bio->bi_rw & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + if (bio_data_dir(bio)) { + cmnd->rw.opcode = nvme_cmd_write; + dma_dir = DMA_TO_DEVICE; + } else { + cmnd->rw.opcode = nvme_cmd_read; + dma_dir = DMA_FROM_DEVICE; + } + + result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); + if (result < 0) + goto free_cmdid; + length = result; + + cmnd->rw.command_id = cmdid; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, + GFP_ATOMIC); + cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.length = cpu_to_le16((length >> 
ns->lba_shift) - 1); + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + + bio->bi_sector += length >> 9; + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; + + free_cmdid: + free_cmdid(nvmeq, cmdid, NULL); + free_iod: + nvme_free_iod(nvmeq->dev, iod); + nomem: + return result; +} + +static void nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + int result = -EBUSY; + + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + } + + spin_unlock_irq(&nvmeq->q_lock); + put_nvmeq(nvmeq); +} + +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, phase; + + head = nvmeq->cq_head; + phase = nvmeq->cq_phase; + + for (;;) { + void *ctx; + nvme_completion_fn fn; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != phase) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + phase = !phase; + } + + ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + fn(nvmeq->dev, ctx, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + return IRQ_NONE; + + writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); + nvmeq->cq_head = head; + nvmeq->cq_phase = phase; + + return IRQ_HANDLED; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + result = nvme_process_cq(nvmeq); + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + +static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +{ + spin_lock_irq(&nvmeq->q_lock); + cancel_cmdid(nvmeq, cmdid, NULL); + spin_unlock_irq(&nvmeq->q_lock); +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +/* + * Returns 0 on success. 
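The loop in nvme_process_cq() above is the standard NVMe phase-tag protocol: the controller writes every completion entry carrying the current phase bit, so the driver consumes entries while that bit matches its expected phase and flips the expectation each time the head wraps, with no consumer index ever read back from hardware. A minimal polling sketch of the same protocol (illustrative struct; the le16_to_cpu matters on big-endian hosts):

    struct cq_sketch {
            volatile struct nvme_completion *cqes;
            u16 head;
            u16 depth;
            u16 phase;      /* starts at 1, like nvmeq->cq_phase */
    };

    static int cq_poll_sketch(struct cq_sketch *cq)
    {
            int found = 0;

            while ((le16_to_cpu(cq->cqes[cq->head].status) & 1) == cq->phase) {
                    /* cq->cqes[cq->head] is valid: dispatch it here */
                    found++;
                    if (++cq->head == cq->depth) {
                            cq->head = 0;
                            cq->phase = !cq->phase; /* next lap uses the other tag */
                    }
            }
            return found;   /* caller rings the CQ head doorbell if found != 0 */
    }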
If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd, u32 *result, unsigned timeout) +{ + int cmdid; + struct sync_cmd_info cmdinfo; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, + timeout); + if (cmdid < 0) + return cmdid; + cmd->common.command_id = cmdid; + + set_current_state(TASK_KILLABLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); + + if (cmdinfo.status == -EINTR) { + nvme_abort_command(nvmeq, cmdid); + return -EINTR; + } + + if (result) + *result = cmdinfo.result; + + return cmdinfo.status; +} + +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + int status; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +static int nvme_set_features(struct nvme_dev *dev, unsigned fid, + 
unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +/** + * nvme_cancel_ios - Cancel outstanding I/Os + * @queue: The queue to cancel I/Os on + * @timeout: True to only cancel I/Os which have timed out + */ +static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + void *ctx; + nvme_completion_fn fn; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + }; + + if (timeout && !time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq->dev, ctx, &cqe); + } +} + +static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static void nvme_free_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + int vector = dev->entry[nvmeq->cq_vector].vector; + + spin_lock_irq(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + bio_endio(bio, -EIO); + } + spin_unlock_irq(&nvmeq->q_lock); + + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); + + /* Don't tell the adapter to delete the admin queue */ + if (qid) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + nvme_free_queue_mem(nvmeq); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int vector) +{ + struct device *dmadev = &dev->pci_dev->dev; + unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * + sizeof(struct nvme_cmd_info)); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + init_waitqueue_head(&nvmeq->sq_full); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); + bio_list_init(&nvmeq->sq_cong); + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; + nvmeq->q_depth = depth; + nvmeq->cq_vector = vector; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, + name, nvmeq); + return 
request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, name, nvmeq); +} + +static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, + int cq_size, int vector) +{ + int result; + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + + if (!nvmeq) + return ERR_PTR(-ENOMEM); + + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto free_nvmeq; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = queue_request_irq(dev, nvmeq, "nvme"); + if (result < 0) + goto release_sq; + + return nvmeq; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + free_nvmeq: + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); + return ERR_PTR(result); +} + +static int nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result = 0; + u32 aqa; + u64 cap; + unsigned long timeout; + struct nvme_queue *nvmeq; + + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + + writel(0, &dev->bar->cc); + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(dev->ctrl_config, &dev->bar->cc); + + cap = readq(&dev->bar->cap); + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + dev->db_stride = NVME_CAP_STRIDE(cap); + + while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + msleep(100); + if (fatal_signal_pending(current)) + result = -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + result = -ENODEV; + } + } + + if (result) { + nvme_free_queue_mem(nvmeq); + return result; + } + + result = queue_request_irq(dev, nvmeq, "nvme admin"); + dev->queues[0] = nvmeq; + return result; +} + +static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) +{ + int i, err, count, nents, offset; + struct scatterlist *sg; + struct page **pages; + struct nvme_iod *iod; + + if (addr & 3) + return ERR_PTR(-EINVAL); + if (!length) + return ERR_PTR(-EINVAL); + + offset = offset_in_page(addr); + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + + iod = nvme_alloc_iod(count, length, GFP_KERNEL); + sg = iod->sg; + sg_init_table(sg, count); + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(int, length, PAGE_SIZE - offset), offset); + length -= (PAGE_SIZE - offset); + offset = 0; + } + sg_mark_end(&sg[i - 1]); + iod->nents = count; + + err = -ENOMEM; + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!nents) + goto free_iod; + + kfree(pages); + return iod; + + free_iod: + kfree(iod); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); + kfree(pages); + return ERR_PTR(err); +} + +static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod) +{ + int i; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq; + struct nvme_user_io io; + struct nvme_command c; + unsigned length; + int status; + struct nvme_iod *iod; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + length = (io.nblocks + 1) << ns->lba_shift; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); + break; + default: + return -EINVAL; + } + + if (IS_ERR(iod)) + return PTR_ERR(iod); + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); + c.rw.reftag = io.reftag; + c.rw.apptag = io.apptag; + c.rw.appmask = io.appmask; + /* XXX: metadata */ + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); + + nvmeq = get_nvmeq(dev); + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep preemption + * disabled. We may be preempted at any point, and be rescheduled + * to a different CPU. That will cause cacheline bouncing, but no + * additional races since q_lock already protects against other CPUs. 
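For completeness, nvme_submit_io() above is reachable from user space through the NVME_IOCTL_SUBMIT_IO ioctl on the namespace block device. A hedged user-space sketch, assuming the include/linux/nvme.h of this era (which defines struct nvme_user_io and the ioctl numbers) is visible to user space and the fd was opened on something like /dev/nvme0n1; buf must be 4-byte aligned and at least 8 LBAs long, since nblocks is zero's based:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/nvme.h>         /* struct nvme_user_io, NVME_IOCTL_SUBMIT_IO */

    static int nvme_read_sketch(int fd, void *buf)
    {
            struct nvme_user_io io;

            memset(&io, 0, sizeof(io));
            io.opcode = nvme_cmd_read;      /* 0x02 */
            io.nblocks = 7;                 /* zero's based: 8 blocks */
            io.slba = 0;
            io.addr = (unsigned long)buf;
            return ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io);
    }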
+ */ + put_nvmeq(nvmeq); + if (length != (io.nblocks + 1) << ns->lba_shift) + status = -ENOMEM; + else + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + + nvme_unmap_user_pages(dev, io.opcode & 1, iod); + nvme_free_iod(dev, iod); + return status; +} + +static int nvme_user_admin_cmd(struct nvme_dev *dev, + struct nvme_admin_cmd __user *ucmd) +{ + struct nvme_admin_cmd cmd; + struct nvme_command c; + int status, length; + struct nvme_iod *uninitialized_var(iod); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + length = cmd.data_len; + if (cmd.data_len) { + iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, + length); + if (IS_ERR(iod)) + return PTR_ERR(iod); + length = nvme_setup_prps(dev, &c.common, iod, length, + GFP_KERNEL); + } + + if (length != cmd.data_len) + status = -ENOMEM; + else + status = nvme_submit_admin_cmd(dev, &c, &cmd.result); + + if (cmd.data_len) { + nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); + nvme_free_iod(dev, iod); + } + + if (!status && copy_to_user(&ucmd->result, &cmd.result, + sizeof(cmd.result))) + status = -EFAULT; + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, +}; + +static void nvme_resubmit_bios(struct nvme_queue *nvmeq) +{ + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + bio_list_add_head(&nvmeq->sq_cong, bio); + break; + } + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + } +} + +static int nvme_kthread(void *data) +{ + struct nvme_dev *dev; + + while (!kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + int i; + for (i = 0; i < dev->queue_count; i++) { + struct nvme_queue *nvmeq = dev->queues[i]; + if (!nvmeq) + continue; + spin_lock_irq(&nvmeq->q_lock); + if (nvme_process_cq(nvmeq)) + printk("process_cq did something\n"); + nvme_cancel_ios(nvmeq, true); + nvme_resubmit_bios(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + } + } + spin_unlock(&dev_list_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + return 0; +} + +static DEFINE_IDA(nvme_index_ida); + +static int nvme_get_ns_idx(void) +{ + int index, error; + + do { + if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) + return -1; + + spin_lock(&dev_list_lock); + error = 
ida_get_new(&nvme_index_ida, &index); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + index = -1; + return index; +} + +static void nvme_put_ns_idx(int index) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_index_ida, index); + spin_unlock(&dev_list_lock); +} + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, + struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int lbaf; + + if (rt->attributes & NVME_LBART_ATTRIB_HIDE) + return NULL; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + ns->queue = blk_alloc_queue(GFP_KERNEL); + if (!ns->queue) + goto out_free_ns; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + blk_queue_make_request(ns->queue, nvme_make_request); + ns->dev = dev; + ns->queue->queuedata = ns; + + disk = alloc_disk(NVME_MINORS); + if (!disk) + goto out_free_queue; + ns->ns_id = nsid; + ns->disk = disk; + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (dev->max_hw_sectors) + blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + + disk->major = nvme_major; + disk->minors = NVME_MINORS; + disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = &dev->pci_dev->dev; + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + + return ns; + + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); + return NULL; +} + +static void nvme_ns_free(struct nvme_ns *ns) +{ + int index = ns->disk->first_minor / NVME_MINORS; + put_disk(ns->disk); + nvme_put_ns_idx(index); + blk_cleanup_queue(ns->queue); + kfree(ns); +} + +static int set_queue_count(struct nvme_dev *dev, int count) +{ + int status; + u32 result; + u32 q_count = (count - 1) | ((count - 1) << 16); + + status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return -EIO; + return min(result & 0xffff, result >> 16) + 1; +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + int result, cpu, i, nr_io_queues, db_bar_size, q_depth; + + nr_io_queues = num_online_cpus(); + result = set_queue_count(dev, nr_io_queues); + if (result < 0) + return result; + if (result < nr_io_queues) + nr_io_queues = result; + + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + + db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); + if (db_bar_size > 8192) { + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), + db_bar_size); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->queues[0]->q_db = dev->dbs; + } + + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].entry = i; + for (;;) { + result = pci_enable_msix(dev->pci_dev, dev->entry, + 
nr_io_queues); + if (result == 0) { + break; + } else if (result > 0) { + nr_io_queues = result; + continue; + } else { + nr_io_queues = 1; + break; + } + } + + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); + /* XXX: handle failure here */ + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < nr_io_queues; i++) { + irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, + NVME_Q_DEPTH); + for (i = 0; i < nr_io_queues; i++) { + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); + if (IS_ERR(dev->queues[i + 1])) + return PTR_ERR(dev->queues[i + 1]); + dev->queue_count++; + } + + for (; i < num_possible_cpus(); i++) { + int target = i % rounddown_pow_of_two(dev->queue_count - 1); + dev->queues[i + 1] = dev->queues[target + 1]; + } + + return 0; +} + +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_free_queue(dev, i); +} + +static int nvme_dev_add(struct nvme_dev *dev) +{ + int res, nn, i; + struct nvme_ns *ns, *next; + struct nvme_id_ctrl *ctrl; + struct nvme_id_ns *id_ns; + void *mem; + dma_addr_t dma_addr; + + res = nvme_setup_io_queues(dev); + if (res) + return res; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + GFP_KERNEL); + + res = nvme_identify(dev, 0, 1, dma_addr); + if (res) { + res = -EIO; + goto out_free; + } + + ctrl = mem; + nn = le32_to_cpup(&ctrl->nn); + dev->oncs = le16_to_cpup(&ctrl->oncs); + memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + if (ctrl->mdts) { + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); + } + + id_ns = mem; + for (i = 1; i <= nn; i++) { + res = nvme_identify(dev, i, 0, dma_addr); + if (res) + continue; + + if (id_ns->ncap == 0) + continue; + + res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, + dma_addr + 4096, NULL); + if (res) + memset(mem + 4096, 0, 4096); + + ns = nvme_alloc_ns(dev, i, mem, mem + 4096); + if (ns) + list_add_tail(&ns->list, &dev->namespaces); + } + list_for_each_entry(ns, &dev->namespaces, list) + add_disk(ns->disk); + + goto out; + + out_free: + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + nvme_ns_free(ns); + } + + out: + dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); + return res; +} + +static int nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + del_gendisk(ns->disk); + nvme_ns_free(ns); + } + + nvme_free_queues(dev); + + return 0; +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + struct device *dmadev = &dev->pci_dev->dev; + dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static 
DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_dev *dev) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + dev->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_dev *dev) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, dev->instance); + spin_unlock(&dev_list_lock); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int bars, result = -ENOMEM; + struct nvme_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), + GFP_KERNEL); + if (!dev->entry) + goto free; + dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), + GFP_KERNEL); + if (!dev->queues) + goto free; + + if (pci_enable_device_mem(pdev)) + goto free; + pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable; + + INIT_LIST_HEAD(&dev->namespaces); + dev->pci_dev = pdev; + pci_set_drvdata(pdev, dev); + dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + result = nvme_set_instance(dev); + if (result) + goto disable; + + dev->entry[0].vector = pdev->irq; + + result = nvme_setup_prp_pools(dev); + if (result) + goto disable_msix; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) { + result = -ENOMEM; + goto disable_msix; + } + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + dev->queue_count++; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + result = nvme_dev_add(dev); + if (result) + goto delete; + + return 0; + + delete: + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + + nvme_free_queues(dev); + unmap: + iounmap(dev->bar); + disable_msix: + pci_disable_msix(pdev); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + disable: + pci_disable_device(pdev); + pci_release_regions(pdev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_remove(dev); + pci_disable_msix(pdev); + iounmap(dev->bar); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + pci_disable_device(pdev); + pci_release_regions(pdev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +/* These functions are yet to be implemented */ +#define nvme_error_detected NULL +#define nvme_dump_registers NULL +#define nvme_link_reset NULL +#define nvme_slot_reset NULL +#define nvme_error_resume NULL +#define nvme_suspend NULL +#define nvme_resume NULL + +static const struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .mmio_enabled = nvme_dump_registers, + .link_reset = nvme_link_reset, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + 
.id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .suspend = nvme_suspend, + .resume = nvme_resume, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + if (IS_ERR(nvme_thread)) + return PTR_ERR(nvme_thread); + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + goto kill_kthread; + else if (result > 0) + nvme_major = result; + + result = pci_register_driver(&nvme_driver); + if (result) + goto unregister_blkdev; + return 0; + + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + kill_kthread: + kthread_stop(nvme_thread); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + unregister_blkdev(nvme_major, "nvme"); + kthread_stop(nvme_thread); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.8"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c deleted file mode 100644 index 26e26607207..00000000000 --- a/drivers/block/nvme.c +++ /dev/null @@ -1,1865 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define NVME_Q_DEPTH 1024 -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) -#define ADMIN_TIMEOUT (60 * HZ) - -static int nvme_major; -module_param(nvme_major, int, 0); - -static int use_threaded_interrupts; -module_param(use_threaded_interrupts, int, 0); - -static DEFINE_SPINLOCK(dev_list_lock); -static LIST_HEAD(dev_list); -static struct task_struct *nvme_thread; - -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; - u16 oncs; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - -/* - * An NVM Express queue. Each device has at least two (one for admin - * commands and one for I/O commands). 
- */ -struct nvme_queue { - struct device *q_dmadev; - struct nvme_dev *dev; - spinlock_t q_lock; - struct nvme_command *sq_cmds; - volatile struct nvme_completion *cqes; - dma_addr_t sq_dma_addr; - dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - u32 __iomem *q_db; - u16 q_depth; - u16 cq_vector; - u16 sq_head; - u16 sq_tail; - u16 cq_head; - u16 cq_phase; - unsigned long cmdid_data[]; -}; - -/* - * Check we didin't inadvertently grow the command struct - */ -static inline void _nvme_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); - BUILD_BUG_ON(sizeof(struct nvme_features) != 64); - BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); - BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); -} - -typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - unsigned long timeout; -}; - -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) -{ - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; -} - -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. - */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; - - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - return cmdid; -} - -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) -{ - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? 
-EINTR : cmdid; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) - -static void special_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_FLUSH) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(&dev->pci_dev->dev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(&dev->pci_dev->dev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - - dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) -{ - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - - if (cmdid >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); - return ctx; -} - -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) -{ - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) -{ - return dev->queues[get_cpu() + 1]; -} - -static void put_nvmeq(struct nvme_queue *nvmeq) -{ - put_cpu(); -} - -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * - * Safe to use from interrupt context - */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - tail = nvmeq->sq_tail; - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq->q_depth) - tail = 0; - writel(tail, nvmeq->q_db); - nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - - return 0; -} - -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - -static __le64 **iod_list(struct nvme_iod *iod) -{ - return ((void *)iod) + iod->offset; -} - -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. 
- */ -static int nvme_npages(unsigned size) -{ - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); -} - -static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; - } - - return iod; -} - -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) -{ - const int last_prp = PAGE_SIZE / 8 - 1; - int i; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma = iod->first_dma; - - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); - for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; - } - kfree(iod); -} - -static void requeue_bio(struct nvme_dev *dev, struct bio *bio) -{ - struct nvme_queue *nvmeq = get_nvmeq(dev); - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - put_nvmeq(nvmeq); - wake_up_process(nvme_thread); -} - -static void bio_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (iod->nents) - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_free_iod(dev, iod); - if (status) { - bio_endio(bio, -EIO); - } else if (bio->bi_vcnt > bio->bi_idx) { - requeue_bio(dev, bio); - } else { - bio_endio(bio, 0); - } -} - -/* length is in bytes. gfp flags indicates whether we may sleep. 
*/ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) -{ - struct dma_pool *pool; - int length = total_len; - struct scatterlist *sg = iod->sg; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = offset_in_page(dma_addr); - __le64 *prp_list; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma; - int nprps, i; - - cmd->prp1 = cpu_to_le64(dma_addr); - length -= (PAGE_SIZE - offset); - if (length <= 0) - return total_len; - - dma_len -= (PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= PAGE_SIZE) { - cmd->prp2 = cpu_to_le64(dma_addr); - return total_len; - } - - nprps = DIV_ROUND_UP(length, PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->npages = 0; - } else { - pool = dev->prp_page_pool; - iod->npages = 1; - } - - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) { - cmd->prp2 = cpu_to_le64(dma_addr); - iod->npages = -1; - return (total_len - length) + PAGE_SIZE; - } - list[0] = prp_list; - iod->first_dma = prp_dma; - cmd->prp2 = cpu_to_le64(prp_dma); - i = 0; - for (;;) { - if (i == PAGE_SIZE / 8) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) - return total_len - length; - list[iod->npages++] = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - BUG_ON(dma_len < 0); - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - return total_len; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec *bvec, *bvprv = NULL; - struct scatterlist *sg = NULL; - int i, old_idx, length = 0, nsegs = 0; - - sg_init_table(iod->sg, psegs); - old_idx = bio->bi_idx; - bio_for_each_segment(bvec, bio, i) { - if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { - sg->length += bvec->bv_len; - } else { - if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) - break; - sg = sg ? sg + 1 : iod->sg; - sg_set_page(sg, bvec->bv_page, bvec->bv_len, - bvec->bv_offset); - nsegs++; - } - length += bvec->bv_len; - bvprv = bvec; - } - bio->bi_idx = i; - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { - bio->bi_idx = old_idx; - return -ENOMEM; - } - return length; -} - -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. 
- */ -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) -{ - struct nvme_dsm_range *range; - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - return -ENOMEM; - - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.command_id = cmdid; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) -{ - int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - return nvme_submit_flush(nvmeq, ns, cmdid); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) -{ - struct nvme_command *cmnd; - struct nvme_iod *iod; - enum dma_data_direction dma_dir; - int cmdid, length, result = -ENOMEM; - u16 control; - u32 dsmgmt; - int psegs = bio_phys_segments(ns->queue, bio); - - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } - - iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); - if (!iod) - goto nomem; - iod->private = bio; - - result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - goto free_iod; - - if (bio->bi_rw & REQ_DISCARD) { - result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (result) - goto free_cmdid; - return result; - } - if ((bio->bi_rw & REQ_FLUSH) && !psegs) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) - control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - memset(cmnd, 0, sizeof(*cmnd)); - if (bio_data_dir(bio)) { - cmnd->rw.opcode = nvme_cmd_write; - dma_dir = DMA_TO_DEVICE; - } else { - cmnd->rw.opcode = nvme_cmd_read; - dma_dir = DMA_FROM_DEVICE; - } - - result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); - if (result < 0) - goto free_cmdid; - length = result; - - cmnd->rw.command_id = cmdid; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, - GFP_ATOMIC); - cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->rw.length = cpu_to_le16((length >> 
ns->lba_shift) - 1); - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - - bio->bi_sector += length >> 9; - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; - - free_cmdid: - free_cmdid(nvmeq, cmdid, NULL); - free_iod: - nvme_free_iod(nvmeq->dev, iod); - nomem: - return result; -} - -static void nvme_make_request(struct request_queue *q, struct bio *bio) -{ - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - spin_lock_irq(&nvmeq->q_lock); - if (bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); -} - -static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) -{ - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - for (;;) { - void *ctx; - nvme_completion_fn fn; - struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) - break; - nvmeq->sq_head = le16_to_cpu(cqe.sq_head); - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq->dev, ctx, &cqe); - } - - /* If the controller ignores the cq head doorbell and continuously - * writes to the queue, it is theoretically possible to wrap around - * the queue twice and mistakenly return IRQ_NONE. Linux only - * requires that 0.1% of your interrupts are handled, so this isn't - * a big problem. - */ - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) - return IRQ_NONE; - - writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; - - return IRQ_HANDLED; -} - -static irqreturn_t nvme_irq(int irq, void *data) -{ - irqreturn_t result; - struct nvme_queue *nvmeq = data; - spin_lock(&nvmeq->q_lock); - result = nvme_process_cq(nvmeq); - spin_unlock(&nvmeq->q_lock); - return result; -} - -static irqreturn_t nvme_irq_check(int irq, void *data) -{ - struct nvme_queue *nvmeq = data; - struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; - if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) - return IRQ_NONE; - return IRQ_WAKE_THREAD; -} - -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) -{ - spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); - spin_unlock_irq(&nvmeq->q_lock); -} - -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - -/* - * Returns 0 on success. 
If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) -{ - int cmdid; - struct sync_cmd_info cmdinfo; - - cmdinfo.task = current; - cmdinfo.status = -EINTR; - - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, - timeout); - if (cmdid < 0) - return cmdid; - cmd->common.command_id = cmdid; - - set_current_state(TASK_KILLABLE); - nvme_submit_cmd(nvmeq, cmd); - schedule(); - - if (cmdinfo.status == -EINTR) { - nvme_abort_command(nvmeq, cmdid); - return -EINTR; - } - - if (result) - *result = cmdinfo.result; - - return cmdinfo.status; -} - -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) -{ - return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); -} - -static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) -{ - int status; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(id); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - int status; - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; - - memset(&c, 0, sizeof(c)); - c.create_cq.opcode = nvme_admin_create_cq; - c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); - c.create_cq.cqid = cpu_to_le16(qid); - c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - int status; - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; - - memset(&c, 0, sizeof(c)); - c.create_sq.opcode = nvme_admin_create_sq; - c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); - c.create_sq.sqid = cpu_to_le16(qid); - c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_sq.sq_flags = cpu_to_le16(flags); - c.create_sq.cqid = cpu_to_le16(qid); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); -} - -static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); -} - -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, - dma_addr_t dma_addr) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.prp1 = cpu_to_le64(dma_addr); - c.identify.cns = cpu_to_le32(cns); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, - 
unsigned dword11, dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; - - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, - }; - - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); - } -} - -static void nvme_free_queue_mem(struct nvme_queue *nvmeq) -{ - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); -} - -static void nvme_free_queue(struct nvme_dev *dev, int qid) -{ - struct nvme_queue *nvmeq = dev->queues[qid]; - int vector = dev->entry[nvmeq->cq_vector].vector; - - spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - spin_unlock_irq(&nvmeq->q_lock); - - irq_set_affinity_hint(vector, NULL); - free_irq(vector, nvmeq); - - /* Don't tell the adapter to delete the admin queue */ - if (qid) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } - - nvme_free_queue_mem(nvmeq); -} - -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int vector) -{ - struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * - sizeof(struct nvme_cmd_info)); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); - if (!nvmeq) - return NULL; - - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); - if (!nvmeq->cqes) - goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); - - nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - goto free_cqdma; - - nvmeq->q_dmadev = dmadev; - nvmeq->dev = dev; - spin_lock_init(&nvmeq->q_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); - bio_list_init(&nvmeq->sq_cong); - nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; - nvmeq->q_depth = depth; - nvmeq->cq_vector = vector; - - return nvmeq; - - free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); - free_nvmeq: - kfree(nvmeq); - return NULL; -} - -static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, - const char *name) -{ - if (use_threaded_interrupts) - return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, - nvme_irq_check, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, - name, nvmeq); - return 
request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, name, nvmeq); -} - -static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, - int cq_size, int vector) -{ - int result; - struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); - - if (!nvmeq) - return ERR_PTR(-ENOMEM); - - result = adapter_alloc_cq(dev, qid, nvmeq); - if (result < 0) - goto free_nvmeq; - - result = adapter_alloc_sq(dev, qid, nvmeq); - if (result < 0) - goto release_cq; - - result = queue_request_irq(dev, nvmeq, "nvme"); - if (result < 0) - goto release_sq; - - return nvmeq; - - release_sq: - adapter_delete_sq(dev, qid); - release_cq: - adapter_delete_cq(dev, qid); - free_nvmeq: - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); - return ERR_PTR(result); -} - -static int nvme_configure_admin_queue(struct nvme_dev *dev) -{ - int result = 0; - u32 aqa; - u64 cap; - unsigned long timeout; - struct nvme_queue *nvmeq; - - dev->dbs = ((void __iomem *)dev->bar) + 4096; - - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); - if (!nvmeq) - return -ENOMEM; - - aqa = nvmeq->q_depth - 1; - aqa |= aqa << 16; - - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(0, &dev->bar->cc); - writel(aqa, &dev->bar->aqa); - writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); - - cap = readq(&dev->bar->cap); - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - dev->db_stride = NVME_CAP_STRIDE(cap); - - while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { - msleep(100); - if (fatal_signal_pending(current)) - result = -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); - result = -ENODEV; - } - } - - if (result) { - nvme_free_queue_mem(nvmeq); - return result; - } - - result = queue_request_irq(dev, nvmeq, "nvme admin"); - dev->queues[0] = nvmeq; - return result; -} - -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length) -{ - int i, err, count, nents, offset; - struct scatterlist *sg; - struct page **pages; - struct nvme_iod *iod; - - if (addr & 3) - return ERR_PTR(-EINVAL); - if (!length) - return ERR_PTR(-EINVAL); - - offset = offset_in_page(addr); - count = DIV_ROUND_UP(offset + length, PAGE_SIZE); - pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - err = get_user_pages_fast(addr, count, 1, pages); - if (err < count) { - count = err; - err = -EFAULT; - goto put_pages; - } - - iod = nvme_alloc_iod(count, length, GFP_KERNEL); - sg = iod->sg; - sg_init_table(sg, count); - for (i = 0; i < count; i++) { - sg_set_page(&sg[i], pages[i], - min_t(int, length, PAGE_SIZE - offset), offset); - length -= (PAGE_SIZE - offset); - offset = 0; - } - sg_mark_end(&sg[i - 1]); - iod->nents = count; - - err = -ENOMEM; - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, - write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (!nents) - goto free_iod; - - kfree(pages); - return iod; - - free_iod: - kfree(iod); - put_pages: - for (i = 0; i < count; i++) - put_page(pages[i]); - kfree(pages); - return ERR_PTR(err); -} - -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod) -{ - int i; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - for (i = 0; i < iod->nents; i++) - put_page(sg_page(&iod->sg[i])); -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; - struct nvme_user_io io; - struct nvme_command c; - unsigned length; - int status; - struct nvme_iod *iod; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - length = (io.nblocks + 1) << ns->lba_shift; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); - break; - default: - return -EINVAL; - } - - if (IS_ERR(iod)) - return PTR_ERR(iod); - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = io.reftag; - c.rw.apptag = io.apptag; - c.rw.appmask = io.appmask; - /* XXX: metadata */ - length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); - - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep preemption - * disabled. We may be preempted at any point, and be rescheduled - * to a different CPU. That will cause cacheline bouncing, but no - * additional races since q_lock already protects against other CPUs. 
- */ - put_nvmeq(nvmeq); - if (length != (io.nblocks + 1) << ns->lba_shift) - status = -ENOMEM; - else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - - nvme_unmap_user_pages(dev, io.opcode & 1, iod); - nvme_free_iod(dev, iod); - return status; -} - -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) -{ - struct nvme_admin_cmd cmd; - struct nvme_command c; - int status, length; - struct nvme_iod *uninitialized_var(iod); - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - length = cmd.data_len; - if (cmd.data_len) { - iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, - length); - if (IS_ERR(iod)) - return PTR_ERR(iod); - length = nvme_setup_prps(dev, &c.common, iod, length, - GFP_KERNEL); - } - - if (length != cmd.data_len) - status = -ENOMEM; - else - status = nvme_submit_admin_cmd(dev, &c, &cmd.result); - - if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); - nvme_free_iod(dev, iod); - } - - if (!status && copy_to_user(&ucmd->result, &cmd.result, - sizeof(cmd.result))) - status = -EFAULT; - - return status; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, -}; - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - if (bio_list_empty(&nvmeq->sq_cong)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static int nvme_kthread(void *data) -{ - struct nvme_dev *dev; - - while (!kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - int i; - for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; - if (!nvmeq) - continue; - spin_lock_irq(&nvmeq->q_lock); - if (nvme_process_cq(nvmeq)) - printk("process_cq did something\n"); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - } - } - spin_unlock(&dev_list_lock); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } - return 0; -} - -static DEFINE_IDA(nvme_index_ida); - -static int nvme_get_ns_idx(void) -{ - int index, error; - - do { - if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) - return -1; - - spin_lock(&dev_list_lock); - error = 
ida_get_new(&nvme_index_ida, &index); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - index = -1; - return index; -} - -static void nvme_put_ns_idx(int index) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_index_ida, index); - spin_unlock(&dev_list_lock); -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; - - ns = kzalloc(sizeof(*ns), GFP_KERNEL); - if (!ns) - return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); - if (!ns->queue) - goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); - ns->dev = dev; - ns->queue->queuedata = ns; - - disk = alloc_disk(NVME_MINORS); - if (!disk) - goto out_free_queue; - ns->ns_id = nsid; - ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); - - disk->major = nvme_major; - disk->minors = NVME_MINORS; - disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = &dev->pci_dev->dev; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - - return ns; - - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); - return NULL; -} - -static void nvme_ns_free(struct nvme_ns *ns) -{ - int index = ns->disk->first_minor / NVME_MINORS; - put_disk(ns->disk); - nvme_put_ns_idx(index); - blk_cleanup_queue(ns->queue); - kfree(ns); -} - -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status) - return -EIO; - return min(result & 0xffff, result >> 16) + 1; -} - -static int nvme_setup_io_queues(struct nvme_dev *dev) -{ - int result, cpu, i, nr_io_queues, db_bar_size, q_depth; - - nr_io_queues = num_online_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result < 0) - return result; - if (result < nr_io_queues) - nr_io_queues = result; - - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, dev->queues[0]); - - db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); - if (db_bar_size > 8192) { - iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), - db_bar_size); - dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->queues[0]->q_db = dev->dbs; - } - - for (i = 0; i < nr_io_queues; i++) - dev->entry[i].entry = i; - for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, - 
nr_io_queues); - if (result == 0) { - break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 1; - break; - } - } - - result = queue_request_irq(dev, dev->queues[0], "nvme admin"); - /* XXX: handle failure here */ - - cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_io_queues; i++) { - irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); - cpu = cpumask_next(cpu, cpu_online_mask); - } - - q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, - NVME_Q_DEPTH); - for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); - if (IS_ERR(dev->queues[i + 1])) - return PTR_ERR(dev->queues[i + 1]); - dev->queue_count++; - } - - for (; i < num_possible_cpus(); i++) { - int target = i % rounddown_pow_of_two(dev->queue_count - 1); - dev->queues[i + 1] = dev->queues[target + 1]; - } - - return 0; -} - -static void nvme_free_queues(struct nvme_dev *dev) -{ - int i; - - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_free_queue(dev, i); -} - -static int nvme_dev_add(struct nvme_dev *dev) -{ - int res, nn, i; - struct nvme_ns *ns, *next; - struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; - void *mem; - dma_addr_t dma_addr; - - res = nvme_setup_io_queues(dev); - if (res) - return res; - - mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, - GFP_KERNEL); - - res = nvme_identify(dev, 0, 1, dma_addr); - if (res) { - res = -EIO; - goto out_free; - } - - ctrl = mem; - nn = le32_to_cpup(&ctrl->nn); - dev->oncs = le16_to_cpup(&ctrl->oncs); - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) { - int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - } - - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; - - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - memset(mem + 4096, 0, 4096); - - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - - goto out; - - out_free: - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - nvme_ns_free(ns); - } - - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; -} - -static int nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - del_gendisk(ns->disk); - nvme_ns_free(ns); - } - - nvme_free_queues(dev); - - return 0; -} - -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - struct device *dmadev = &dev->pci_dev->dev; - dev->prp_page_pool = dma_pool_create("prp list page", dmadev, - PAGE_SIZE, PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, - 256, 256, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - -static 
DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - int bars, result = -ENOMEM; - struct nvme_dev *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); - if (!dev->entry) - goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); - if (!dev->queues) - goto free; - - if (pci_enable_device_mem(pdev)) - goto free; - pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable; - - INIT_LIST_HEAD(&dev->namespaces); - dev->pci_dev = pdev; - pci_set_drvdata(pdev, dev); - dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - result = nvme_set_instance(dev); - if (result) - goto disable; - - dev->entry[0].vector = pdev->irq; - - result = nvme_setup_prp_pools(dev); - if (result) - goto disable_msix; - - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) { - result = -ENOMEM; - goto disable_msix; - } - - result = nvme_configure_admin_queue(dev); - if (result) - goto unmap; - dev->queue_count++; - - spin_lock(&dev_list_lock); - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - result = nvme_dev_add(dev); - if (result) - goto delete; - - return 0; - - delete: - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - - nvme_free_queues(dev); - unmap: - iounmap(dev->bar); - disable_msix: - pci_disable_msix(pdev); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - disable: - pci_disable_device(pdev); - pci_release_regions(pdev); - free: - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); - return result; -} - -static void nvme_remove(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_remove(dev); - pci_disable_msix(pdev); - iounmap(dev->bar); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - pci_disable_device(pdev); - pci_release_regions(pdev); - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); -} - -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL -#define nvme_suspend NULL -#define nvme_resume NULL - -static const struct pci_error_handlers nvme_err_handler = { - .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, - .slot_reset = nvme_slot_reset, - .resume = nvme_error_resume, -}; - -/* Move to pci_ids.h later */ -#define PCI_CLASS_STORAGE_EXPRESS 0x010802 - -static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, - { 0, } -}; -MODULE_DEVICE_TABLE(pci, nvme_id_table); - -static struct pci_driver nvme_driver = { - .name = "nvme", - 
.id_table = nvme_id_table, - .probe = nvme_probe, - .remove = nvme_remove, - .suspend = nvme_suspend, - .resume = nvme_resume, - .err_handler = &nvme_err_handler, -}; - -static int __init nvme_init(void) -{ - int result; - - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - if (IS_ERR(nvme_thread)) - return PTR_ERR(nvme_thread); - - result = register_blkdev(nvme_major, "nvme"); - if (result < 0) - goto kill_kthread; - else if (result > 0) - nvme_major = result; - - result = pci_register_driver(&nvme_driver); - if (result) - goto unregister_blkdev; - return 0; - - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); - kill_kthread: - kthread_stop(nvme_thread); - return result; -} - -static void __exit nvme_exit(void) -{ - pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); - kthread_stop(nvme_thread); -} - -MODULE_AUTHOR("Matthew Wilcox "); -MODULE_LICENSE("GPL"); -MODULE_VERSION("0.8"); -module_init(nvme_init); -module_exit(nvme_exit); -- cgit v1.2.3 From 13c3b0fcc8e33ba49f252378f6e7290b146042af Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Mar 2013 18:40:57 -0700 Subject: NVMe: Move structures & definitions to header file nvme-scsi.c uses several data structures and definitions that were previously private to nvme-core.c. Move the definitions to nvme.h, protected by __KERNEL__. Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 55 ------------------------------------------- include/linux/nvme.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 55 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 26e26607207..1f98040cf67 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -46,7 +46,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -59,44 +58,6 @@ static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; - u16 oncs; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -295,22 +256,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 
0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bde44c1fd21..6f899add14a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -493,4 +493,64 @@ struct nvme_admin_cmd { #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#ifdef __KERNEL__ +#include + +#define NVME_IO_TIMEOUT (5 * HZ) + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u32 max_hw_sectors; + u16 oncs; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; +#endif + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From f8ebf8409abfdaeeb8c847381629a2a8b8e3d816 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 27 Mar 2013 07:13:41 -0400 Subject: NVMe: Add definitions for format command The SCSI emulation has the ability to send format commands, so we need to add the definition of the command. Also add a missing error code. 
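For reference, the new structure maps straight onto a Format NVM admin command. A minimal sketch of a caller, assuming the nvme_admin_format_nvm opcode from nvme.h and reusing nvme_submit_admin_cmd() from nvme-core.c; the helper name and the LBA-format value passed in cdw10 are illustrative and not part of this patch:

/*
 * Illustrative only: format namespace 'nsid' to LBA format index 'lbaf'.
 * Not part of this patch; shows how struct nvme_format_cmd is filled in.
 */
static int nvme_format_example(struct nvme_dev *dev, u32 nsid, u8 lbaf)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.format.opcode = nvme_admin_format_nvm;
	c.format.nsid = cpu_to_le32(nsid);
	c.format.cdw10 = cpu_to_le32(lbaf & 0xf);	/* LBAF in bits 3:0 */

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

Per the NVMe spec, bits 3:0 of cdw10 select the LBA format; the remaining cdw10 fields (MSET, PI, PIL, SES) are left zero here.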
Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 1 + include/linux/nvme.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 1f98040cf67..d0cfb85d558 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -93,6 +93,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6f899add14a..f1974cab60c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -393,6 +393,16 @@ struct nvme_download_firmware { __u32 rsvd12[4]; }; +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -403,6 +413,7 @@ struct nvme_command { struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; }; }; @@ -420,6 +431,7 @@ enum { NVME_SC_FUSED_FAIL = 0x9, NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, -- cgit v1.2.3 From 5d0f6131a79adfa1fb51309c5f81a2a4ef879dd4 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Mar 2013 18:40:58 -0700 Subject: NVMe: Add nvme-scsi.c Translates SCSI commands in SG_IO ioctl to NVMe commands. Uses the scsi-nvme translation spec from nvmexpress.org as reference. 
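For context on how this path is driven: once nvme_ioctl() routes SG_IO to nvme_sg_io() (see the hunk below), the translation layer can be exercised from userspace through the ordinary sg interface. A minimal sketch, with a hypothetical device path, issuing a standard INQUIRY that nvme-scsi.c synthesizes a reply for from Identify data:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

/* Illustrative only; the device path is an assumption, e.g. "/dev/nvme0n1". */
int nvme_sgio_inquiry_example(const char *path)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96-byte alloc len */
	unsigned char buf[96], sense[32];
	struct sg_io_hdr hdr;
	int fd, ret;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.cmdp = cdb;
	hdr.cmd_len = sizeof(cdb);
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxferp = buf;
	hdr.dxfer_len = sizeof(buf);
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	ret = ioctl(fd, SG_IO, &hdr);		/* dispatched to nvme_sg_io() */
	close(fd);
	return ret;
}

The CDB is a plain 6-byte INQUIRY (opcode 0x12) with a 96-byte allocation length; any sense data the translation builds comes back through sbp.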
Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/Makefile | 2 +- drivers/block/nvme-core.c | 37 +- drivers/block/nvme-scsi.c | 2941 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 35 + 4 files changed, 2997 insertions(+), 18 deletions(-) create mode 100644 drivers/block/nvme-scsi.c diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 2a41c86d3ad..ca07399a8d9 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,5 +42,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ -nvme-y := nvme-core.o +nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d0cfb85d558..a89f7dbefba 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -39,7 +39,7 @@ #include #include #include - +#include #include #define NVME_Q_DEPTH 1024 @@ -224,12 +224,12 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { return dev->queues[get_cpu() + 1]; } -static void put_nvmeq(struct nvme_queue *nvmeq) +void put_nvmeq(struct nvme_queue *nvmeq) { put_cpu(); } @@ -290,7 +290,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) return iod; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = PAGE_SIZE / 8 - 1; int i; @@ -339,9 +339,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -512,7 +511,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, special_completion, NVME_IO_TIMEOUT); @@ -715,8 +714,8 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -745,7 +744,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, return cmdinfo.status; } -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); @@ -818,7 +817,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, +int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, dma_addr_t dma_addr) { struct nvme_command c; @@ -832,7 +831,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -846,8 +845,8 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, return nvme_submit_admin_cmd(dev, &c, result); } -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -1065,7 +1064,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length) { int i, err, count, nents, offset; @@ -1121,7 +1120,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, return ERR_PTR(err); } -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod) { int i; @@ -1257,6 +1256,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); default: return -ENOTTY; } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c new file mode 100644 index 00000000000..483af3585c9 --- /dev/null +++ b/drivers/block/nvme-scsi.c @@ -0,0 +1,2941 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * Refer to the SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int sg_version_num = 30534; /* 2 digits for each component */ + +#define SNTI_TRANSLATION_SUCCESS 0 +#define SNTI_INTERNAL_ERROR 1 + +/* VPD Page Codes */ +#define VPD_SUPPORTED_PAGES 0x00 +#define VPD_SERIAL_NUMBER 0x80 +#define VPD_DEVICE_IDENTIFIERS 0x83 +#define VPD_EXTENDED_INQUIRY 0x86 +#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 + +/* CDB offsets */ +#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 +#define REPORT_LUNS_SR_OFFSET 2 +#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 +#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 +#define REQUEST_SENSE_DESC_OFFSET 1 +#define REQUEST_SENSE_DESC_MASK 0x01 +#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 +#define INQUIRY_EVPD_BYTE_OFFSET 1 +#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 +#define INQUIRY_EVPD_BIT_MASK 1 +#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 +#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 +#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF +#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 +#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 +#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 +#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 +#define START_STOP_UNIT_CDB_START_OFFSET 4 +#define START_STOP_UNIT_CDB_START_MASK 0x1 +#define WRITE_BUFFER_CDB_MODE_OFFSET 1 +#define WRITE_BUFFER_CDB_MODE_MASK 0x1F +#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 +#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 +#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 +#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 +#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 +#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 +#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 +#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 +#define FORMAT_UNIT_PROT_INT_OFFSET 3 +#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 +#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 + +/* Misc. 
defines */ +#define NIBBLE_SHIFT 4 +#define FIXED_SENSE_DATA 0x70 +#define DESC_FORMAT_SENSE_DATA 0x72 +#define FIXED_SENSE_DATA_ADD_LENGTH 10 +#define LUN_ENTRY_SIZE 8 +#define LUN_DATA_HEADER_SIZE 8 +#define ALL_LUNS_RETURNED 0x02 +#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 +#define RESTRICTED_LUNS_RETURNED 0x00 +#define NVME_POWER_STATE_START_VALID 0x00 +#define NVME_POWER_STATE_ACTIVE 0x01 +#define NVME_POWER_STATE_IDLE 0x02 +#define NVME_POWER_STATE_STANDBY 0x03 +#define NVME_POWER_STATE_LU_CONTROL 0x07 +#define POWER_STATE_0 0 +#define POWER_STATE_1 1 +#define POWER_STATE_2 2 +#define POWER_STATE_3 3 +#define DOWNLOAD_SAVE_ACTIVATE 0x05 +#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E +#define ACTIVATE_DEFERRED_MICROCODE 0x0F +#define FORMAT_UNIT_IMMED_MASK 0x2 +#define FORMAT_UNIT_IMMED_OFFSET 1 +#define KELVIN_TEMP_FACTOR 273 +#define FIXED_FMT_SENSE_DATA_SIZE 18 +#define DESC_FMT_SENSE_DATA_SIZE 8 + +/* SCSI/NVMe defines and bit masks */ +#define INQ_STANDARD_INQUIRY_PAGE 0x00 +#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 +#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 +#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 +#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 +#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 +#define INQ_SERIAL_NUMBER_LENGTH 0x14 +#define INQ_NUM_SUPPORTED_VPD_PAGES 5 +#define VERSION_SPC_4 0x06 +#define ACA_UNSUPPORTED 0 +#define STANDARD_INQUIRY_LENGTH 36 +#define ADDITIONAL_STD_INQ_LENGTH 31 +#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C +#define RESERVED_FIELD 0 + +/* SCSI READ/WRITE Defines */ +#define IO_CDB_WP_MASK 0xE0 +#define IO_CDB_WP_SHIFT 5 +#define IO_CDB_FUA_MASK 0x8 +#define IO_6_CDB_LBA_OFFSET 0 +#define IO_6_CDB_LBA_MASK 0x001FFFFF +#define IO_6_CDB_TX_LEN_OFFSET 4 +#define IO_6_DEFAULT_TX_LEN 256 +#define IO_10_CDB_LBA_OFFSET 2 +#define IO_10_CDB_TX_LEN_OFFSET 7 +#define IO_10_CDB_WP_OFFSET 1 +#define IO_10_CDB_FUA_OFFSET 1 +#define IO_12_CDB_LBA_OFFSET 2 +#define IO_12_CDB_TX_LEN_OFFSET 6 +#define IO_12_CDB_WP_OFFSET 1 +#define IO_12_CDB_FUA_OFFSET 1 +#define IO_16_CDB_FUA_OFFSET 1 +#define IO_16_CDB_WP_OFFSET 1 +#define IO_16_CDB_LBA_OFFSET 2 +#define IO_16_CDB_TX_LEN_OFFSET 10 + +/* Mode Sense/Select defines */ +#define MODE_PAGE_INFO_EXCEP 0x1C +#define MODE_PAGE_CACHING 0x08 +#define MODE_PAGE_CONTROL 0x0A +#define MODE_PAGE_POWER_CONDITION 0x1A +#define MODE_PAGE_RETURN_ALL 0x3F +#define MODE_PAGE_BLK_DES_LEN 0x08 +#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 +#define MODE_PAGE_CACHING_LEN 0x14 +#define MODE_PAGE_CONTROL_LEN 0x0C +#define MODE_PAGE_POW_CND_LEN 0x28 +#define MODE_PAGE_INF_EXC_LEN 0x0C +#define MODE_PAGE_ALL_LEN 0x54 +#define MODE_SENSE6_MPH_SIZE 4 +#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 +#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 +#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 +#define MODE_SENSE_PAGE_CODE_OFFSET 2 +#define MODE_SENSE_PAGE_CODE_MASK 0x3F +#define MODE_SENSE_LLBAA_OFFSET 1 +#define MODE_SENSE_LLBAA_MASK 0x10 +#define MODE_SENSE_LLBAA_SHIFT 4 +#define MODE_SENSE_DBD_OFFSET 1 +#define MODE_SENSE_DBD_MASK 8 +#define MODE_SENSE_DBD_SHIFT 3 +#define MODE_SENSE10_MPH_SIZE 8 +#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 +#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 +#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 +#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 +#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 +#define MODE_SELECT_6_BD_OFFSET 3 +#define MODE_SELECT_10_BD_OFFSET 6 +#define MODE_SELECT_10_LLBAA_OFFSET 4 +#define MODE_SELECT_10_LLBAA_MASK 1 
+#define MODE_SELECT_6_MPH_SIZE 4 +#define MODE_SELECT_10_MPH_SIZE 8 +#define CACHING_MODE_PAGE_WCE_MASK 0x04 +#define MODE_SENSE_BLK_DESC_ENABLED 0 +#define MODE_SENSE_BLK_DESC_COUNT 1 +#define MODE_SELECT_PAGE_CODE_MASK 0x3F +#define SHORT_DESC_BLOCK 8 +#define LONG_DESC_BLOCK 16 +#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 +#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A +#define MODE_PAGE_CACHING_LEN_FIELD 0x12 +#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A +#define MODE_SENSE_PC_CURRENT_VALUES 0 + +/* Log Sense defines */ +#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 +#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 +#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F +#define LOG_PAGE_TEMPERATURE_PAGE 0x0D +#define LOG_SENSE_CDB_SP_OFFSET 1 +#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 +#define LOG_SENSE_CDB_PC_OFFSET 2 +#define LOG_SENSE_CDB_PC_MASK 0xC0 +#define LOG_SENSE_CDB_PC_SHIFT 6 +#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 +#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F +#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 +#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 +#define LOG_INFO_EXCP_PAGE_LENGTH 0xC +#define REMAINING_TEMP_PAGE_LENGTH 0xC +#define LOG_TEMP_PAGE_LENGTH 0x10 +#define LOG_TEMP_UNKNOWN 0xFF +#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 + +/* Read Capacity defines */ +#define READ_CAP_10_RESP_SIZE 8 +#define READ_CAP_16_RESP_SIZE 32 + +/* NVMe Namespace and Command Defines */ +#define NVME_GET_SMART_LOG_PAGE 0x02 +#define NVME_GET_FEAT_TEMP_THRESH 0x04 +#define BYTES_TO_DWORDS 4 +#define NVME_MAX_FIRMWARE_SLOT 7 + +/* Report LUNs defines */ +#define REPORT_LUNS_FIRST_LUN_OFFSET 8 + +/* SCSI ADDITIONAL SENSE Codes */ + +#define SCSI_ASC_NO_SENSE 0x00 +#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 +#define SCSI_ASC_LUN_NOT_READY 0x04 +#define SCSI_ASC_WARNING 0x0B +#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 +#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D +#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 +#define SCSI_ASC_ILLEGAL_COMMAND 0x20 +#define SCSI_ASC_ILLEGAL_BLOCK 0x21 +#define SCSI_ASC_INVALID_CDB 0x24 +#define SCSI_ASC_INVALID_LUN 0x25 +#define SCSI_ASC_INVALID_PARAMETER 0x26 +#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 +#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 + +/* SCSI ADDITIONAL SENSE Code Qualifiers */ + +#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 +#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 +#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 +#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 +#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 +#define SCSI_ASCQ_INVALID_LUN_ID 0x09 + +/** + * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to + * enable DPOFUA support type 0x10 value. 
+ */ +#define DEVICE_SPECIFIC_PARAMETER 0 +#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) + +/* MACROs to extract information from CDBs */ + +#define GET_OPCODE(cdb) cdb[0] + +#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) + +#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) + +#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ +(cdb[index + 1] << 8) | \ +(cdb[index + 2] << 0)) + +#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ +(cdb[index + 1] << 16) | \ +(cdb[index + 2] << 8) | \ +(cdb[index + 3] << 0)) + +#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ +(((u64)cdb[index + 1]) << 48) | \ +(((u64)cdb[index + 2]) << 40) | \ +(((u64)cdb[index + 3]) << 32) | \ +(((u64)cdb[index + 4]) << 24) | \ +(((u64)cdb[index + 5]) << 16) | \ +(((u64)cdb[index + 6]) << 8) | \ +(((u64)cdb[index + 7]) << 0)) + +/* Inquiry Helper Macros */ +#define GET_INQ_EVPD_BIT(cdb) \ +((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ +INQUIRY_EVPD_BIT_MASK) ? 1 : 0) + +#define GET_INQ_PAGE_CODE(cdb) \ +(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) + +#define GET_INQ_ALLOC_LENGTH(cdb) \ +(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) + +/* Report LUNs Helper Macros */ +#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) + +/* Read Capacity Helper Macros */ +#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) + +#define IS_READ_CAP_16(cdb) \ +((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) + +/* Request Sense Helper Macros */ +#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ +(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) + +/* Mode Sense Helper Macros */ +#define GET_MODE_SENSE_DBD(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ +MODE_SENSE_DBD_SHIFT) + +#define GET_MODE_SENSE_LLBAA(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ +MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) + +#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ +(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) + + +/* Struct to gather data that needs to be extracted from a SCSI CDB. + Not conforming to any particular CDB variant, but compatible with all. 
*/ + +struct nvme_trans_io_cdb { + u8 fua; + u8 prot_info; + u64 lba; + u32 xfer_len; +}; + + +/* Internal Helper Functions */ + + +/* Copy data to userspace memory */ + +static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = from; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_to_user(__user sgl[i].iov_base, index, + xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + not_copied = copy_to_user(__user hdr->dxferp, from, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Copy data from userspace memory */ + +static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = to; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_from_user(index, + __user sgl[i].iov_base, xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + + not_copied = copy_from_user(to, __user hdr->dxferp, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Status/Sense Buffer Writeback */ + +static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, + u8 asc, u8 ascq) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 xfer_len; + u8 resp[DESC_FMT_SENSE_DATA_SIZE]; + + if (scsi_status_is_good(status)) { + hdr->status = SAM_STAT_GOOD; + hdr->masked_status = GOOD; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + hdr->sb_len_wr = 0; + } else { + hdr->status = status; + hdr->masked_status = status >> 1; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + + memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); + resp[0] = DESC_FORMAT_SENSE_DATA; + resp[1] = sense_key; + resp[2] = asc; + resp[3] = ascq; + + xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); + hdr->sb_len_wr = xfer_len; + if (copy_to_user(__user hdr->sbp, resp, xfer_len) > 0) + res = -EFAULT; + } + + return res; +} + +static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) +{ + u8 status, sense_key, asc, ascq; + int res = SNTI_TRANSLATION_SUCCESS; + + /* For non-nvme (Linux) errors, simply return the error code */ + if (nvme_sc < 0) + return nvme_sc; + + /* Mask DNR, More, and reserved fields */ + nvme_sc &= 0x7FF; + + switch (nvme_sc) { + /* Generic Command Status */ + case NVME_SC_SUCCESS: + status = SAM_STAT_GOOD; + sense_key = NO_SENSE; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_OPCODE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_COMMAND; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_FIELD: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_DATA_XFER_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = 
SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_POWER_LOSS: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_WARNING; + ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case NVME_SC_INTERNAL: + status = SAM_STAT_CHECK_CONDITION; + sense_key = HARDWARE_ERROR; + asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_REQ: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_QUEUE: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_FAIL: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_MISSING: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_NS: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + case NVME_SC_LBA_RANGE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_BLOCK; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_CAP_EXCEEDED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_NS_NOT_READY: + status = SAM_STAT_CHECK_CONDITION; + sense_key = NOT_READY; + asc = SCSI_ASC_LUN_NOT_READY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Command Specific Status */ + case NVME_SC_INVALID_FORMAT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_FORMAT_COMMAND_FAILED; + ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case NVME_SC_BAD_ATTRIBUTES: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Media Errors */ + case NVME_SC_WRITE_FAULT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_READ_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_UNRECOVERED_READ_ERROR; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_GUARD_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; + break; + case NVME_SC_APPTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; + break; + case NVME_SC_REFTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; + break; + case NVME_SC_COMPARE_FAILED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MISCOMPARE; + asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ACCESS_DENIED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = 
SCSI_ASCQ_INVALID_LUN_ID;
+		break;
+
+	/* Unspecified/Default */
+	case NVME_SC_CMDID_CONFLICT:
+	case NVME_SC_CMD_SEQ_ERROR:
+	case NVME_SC_CQ_INVALID:
+	case NVME_SC_QID_INVALID:
+	case NVME_SC_QUEUE_SIZE:
+	case NVME_SC_ABORT_LIMIT:
+	case NVME_SC_ABORT_MISSING:
+	case NVME_SC_ASYNC_LIMIT:
+	case NVME_SC_FIRMWARE_SLOT:
+	case NVME_SC_FIRMWARE_IMAGE:
+	case NVME_SC_INVALID_VECTOR:
+	case NVME_SC_INVALID_LOG_PAGE:
+	default:
+		status = SAM_STAT_CHECK_CONDITION;
+		sense_key = ILLEGAL_REQUEST;
+		asc = SCSI_ASC_NO_SENSE;
+		ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+		break;
+	}
+
+	res = nvme_trans_completion(hdr, status, sense_key, asc, ascq);
+
+	return res;
+}
+
+/* INQUIRY Helper Functions */
+
+static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	dma_addr_t dma_addr;
+	void *mem;
+	struct nvme_id_ns *id_ns;
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	int xfer_len;
+	u8 resp_data_format = 0x02;
+	u8 protect;
+	u8 cmdque = 0x01 << 1;
+
+	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+				&dma_addr, GFP_KERNEL);
+	if (mem == NULL) {
+		res = -ENOMEM;
+		goto out_dma;
+	}
+
+	/* nvme ns identify - use DPS value for PROTECT field */
+	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	/*
+	 * If nvme_sc was -ve, res will be -ve here.
+	 * If nvme_sc was +ve, the status will have been translated, and res
+	 * can only be 0 or -ve.
+	 * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc
+	 * - If -ve, return because it's a Linux error.
+	 */
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+	id_ns = mem;
+	(id_ns->dps) ? (protect = 0x01) : (protect = 0);
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[2] = VERSION_SPC_4;
+	inq_response[3] = resp_data_format;	/* normaca=0 | hisup=0 */
+	inq_response[4] = ADDITIONAL_STD_INQ_LENGTH;
+	inq_response[5] = protect;	/* sccs=0 | acc=0 | tpgs=0 | pc3=0 */
+	inq_response[7] = cmdque;	/* wbus16=0 | sync=0 | vs=0 */
+	strncpy(&inq_response[8], "NVMe    ", 8);
+	strncpy(&inq_response[16], dev->model, 16);
+	strncpy(&inq_response[32], dev->firmware_rev, 4);
+
+	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+			  dma_addr);
+ out_dma:
+	return res;
+}
+
+static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int xfer_len;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE;	/* Page Code */
+	inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES;	/* Page Length */
+	inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE;
+	inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE;
+	inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE;
+	inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE;
+	inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE;
+
+	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+	return res;
+}
+
+static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr, u8 *inq_response,
+					int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int xfer_len;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
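+	/*
+	 * Unit Serial Number VPD page (0x80): the 20-byte controller
+	 * serial number (INQ_SERIAL_NUMBER_LENGTH) captured from
+	 * Identify Controller is returned starting at byte 4.
+	 */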
+	inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE;	/* Page Code */
+	inq_response[3] = INQ_SERIAL_NUMBER_LENGTH;	/* Page Length */
+	strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH);
+
+	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+	return res;
+}
+
+static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *inq_response, int alloc_len)
+{
+	struct nvme_dev *dev = ns->dev;
+	dma_addr_t dma_addr;
+	void *mem;
+	struct nvme_id_ctrl *id_ctrl;
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	u8 ieee[4];
+	int xfer_len;
+	u32 tmp_id = cpu_to_be32(ns->ns_id);
+
+	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+				&dma_addr, GFP_KERNEL);
+	if (mem == NULL) {
+		res = -ENOMEM;
+		goto out_dma;
+	}
+
+	/* nvme controller identify */
+	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_free;
+	if (nvme_sc) {
+		res = nvme_sc;
+		goto out_free;
+	}
+	id_ctrl = mem;
+
+	/* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */
+	ieee[0] = id_ctrl->ieee[0] << 4;
+	ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4;
+	ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4;
+	ieee[3] = id_ctrl->ieee[2] >> 4;
+
+	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+	inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;	/* Page Code */
+	inq_response[3] = 20;	/* Page Length */
+	/* Designation Descriptor start */
+	inq_response[4] = 0x01;	/* Proto ID=0h | Code set=1h */
+	inq_response[5] = 0x03;	/* PIV=0b | Asso=00b | Designator Type=3h */
+	inq_response[6] = 0x00;	/* Rsvd */
+	inq_response[7] = 16;	/* Designator Length */
+	/* Designator start */
+	inq_response[8] = 0x60 | ieee[3];	/* NAA=6h | IEEE ID MSB, High nibble*/
+	inq_response[9] = ieee[2];	/* IEEE ID */
+	inq_response[10] = ieee[1];	/* IEEE ID */
+	inq_response[11] = ieee[0];	/* IEEE ID| Vendor Specific ID...
*/ + inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; + inq_response[13] = (dev->pci_dev->vendor & 0x00FF); + inq_response[14] = dev->serial[0]; + inq_response[15] = dev->serial[1]; + inq_response[16] = dev->model[0]; + inq_response[17] = dev->model[1]; + memcpy(&inq_response[18], &tmp_id, sizeof(u32)); + /* Last 2 bytes are zero */ + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + struct nvme_id_ns *id_ns; + int xfer_len; + u8 microcode = 0x80; + u8 spt; + u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; + u8 grd_chk, app_chk, ref_chk, protect; + u8 uask_sup = 0x20; + u8 v_sup; + u8 luiclr = 0x01; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + spt = spt_lut[(id_ns->dpc) & 0x07] << 3; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + grd_chk = protect << 2; + app_chk = protect << 1; + ref_chk = protect; + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + v_sup = id_ctrl->vwc; + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; + inq_response[5] = uask_sup; + inq_response[6] = v_sup; + inq_response[7] = luiclr; + inq_response[8] = 0; + inq_response[9] = 0; + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ + inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ + inq_response[6] = 0x00; /* Form Factor */ + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, 
inq_response, xfer_len); + + kfree(inq_response); + out_mem: + return res; +} + +/* LOG SENSE Helper Functions */ + +static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + + log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + + log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; + log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; + + xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, + struct sg_io_hdr *hdr, int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u8 temp_c; + u16 temp_k; + + log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c = temp_k - KELVIN_TEMP_FACTOR; + } + + log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; + /* Informational Exceptions Log Parameter 1 Start */ + /* Parameter Code=0x0000 bytes 4,5 */ + log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ + log_response[7] = 0x04; /* PARAMETER LENGTH */ + /* Add sense Code and qualifier = 0x00 each */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[10] = temp_c; + + xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u32 feature_resp; + u8 temp_c_cur, temp_c_thresh; + u16 temp_k; + + log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + 
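+	/*
+	 * The current temperature is read from the SMART / health log
+	 * page and the reference temperature via Get Features
+	 * (Temperature Threshold); both are reported in Kelvin and are
+	 * converted to Celsius (KELVIN_TEMP_FACTOR) for the SCSI
+	 * temperature log page below.
+	 */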
memset(log_response, 0, LOG_TEMP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c_cur = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; + } + + /* Get Features for Temp Threshold */ + res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + &feature_resp); + if (res != NVME_SC_SUCCESS) + temp_c_thresh = LOG_TEMP_UNKNOWN; + else + temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; + + log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_TEMP_PAGE_LENGTH; + /* Temperature Log Parameter 1 (Temperature) Start */ + /* Parameter Code = 0x0000 */ + log_response[6] = 0x01; /* Format and Linking = 01b */ + log_response[7] = 0x02; /* Parameter Length */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[9] = temp_c_cur; + /* Temperature Log Parameter 2 (Reference Temperature) Start */ + log_response[11] = 0x01; /* Parameter Code = 0x0001 */ + log_response[12] = 0x01; /* Format and Linking = 01b */ + log_response[13] = 0x02; /* Parameter Length */ + /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ + log_response[15] = temp_c_thresh; + + xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +/* MODE SENSE Helper Functions */ + +static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, + u16 mode_data_length, u16 blk_desc_len) +{ + /* Quick check to make sure I don't stomp on my own memory... 
*/ + if ((cdb10 && len < 8) || (!cdb10 && len < 4)) + return SNTI_INTERNAL_ERROR; + + if (cdb10) { + resp[0] = (mode_data_length & 0xFF00) >> 8; + resp[1] = (mode_data_length & 0x00FF); + /* resp[2] and [3] are zero */ + resp[4] = llbaa; + resp[5] = RESERVED_FIELD; + resp[6] = (blk_desc_len & 0xFF00) >> 8; + resp[7] = (blk_desc_len & 0x00FF); + } else { + resp[0] = (mode_data_length & 0x00FF); + /* resp[1] and [2] are zero */ + resp[3] = (blk_desc_len & 0x00FF); + } + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len, u8 llbaa) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + u32 lba_length; + + if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + + if (llbaa == 0) { + u32 tmp_cap = cpu_to_be32(id_ns->ncap); + /* Byte 4 is reserved */ + u32 tmp_len = cpu_to_be32(lba_length) & 0x00FFFFFF; + + memcpy(resp, &tmp_cap, sizeof(u32)); + memcpy(&resp[4], &tmp_len, sizeof(u32)); + } else { + u64 tmp_cap = cpu_to_be64(id_ns->ncap); + u32 tmp_len = cpu_to_be32(lba_length); + + memcpy(resp, &tmp_cap, sizeof(u64)); + /* Bytes 8, 9, 10, 11 are reserved */ + memcpy(&resp[12], &tmp_len, sizeof(u32)); + } + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_fill_control_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_CONTROL_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_CONTROL; + resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; + resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, + * D_SENSE=1, GLTSD=1, RLEC=0 */ + resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ + /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ + resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ + /* resp[6] and [7] are obsolete, thus zero */ + resp[8] = 0xFF; /* Busy timeout period = 0xffff */ + resp[9] = 0xFF; + /* Bytes 10,11: Extended selftest completion time = 0x0000 */ + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_caching_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + u32 feature_resp; + u8 vwc; + + if (len < MODE_PAGE_CACHING_LEN) + return SNTI_INTERNAL_ERROR; + + nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + &feature_resp); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + vwc = feature_resp & 0x00000001; + + resp[0] = MODE_PAGE_CACHING; + resp[1] = MODE_PAGE_CACHING_LEN_FIELD; + resp[2] = vwc << 2; + + out: + return res; +} + +static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + 
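+	/*
+	 * NVMe has no analogue of the SCSI power condition timers, so
+	 * only the page code and page length are filled in; all timer
+	 * fields in the page remain zero.
+	 */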
if (len < MODE_PAGE_POW_CND_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_POWER_CONDITION; + resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_INF_EXC_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_INFO_EXCEP; + resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; + resp[2] = 0x88; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 mode_pages_offset_1 = 0; + u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; + + mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; + mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; + mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; + + res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], + MODE_PAGE_CACHING_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], + MODE_PAGE_CONTROL_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], + MODE_PAGE_POW_CND_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + MODE_PAGE_INF_EXC_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) +{ + if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { + /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ + return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; + } else { + return 0; + } +} + +static int nvme_trans_mode_page_create(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd, + u16 alloc_len, u8 cdb10, + int (*mode_page_fill_func) + (struct nvme_ns *, + struct sg_io_hdr *hdr, u8 *, int), + u16 mode_pages_tot_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *response; + u8 dbd, llbaa; + u16 resp_size; + int mph_size; + u16 mode_pages_offset_1; + u16 blk_desc_len, blk_desc_offset, mode_data_length; + + dbd = GET_MODE_SENSE_DBD(cmd); + llbaa = GET_MODE_SENSE_LLBAA(cmd); + mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); + + resp_size = mph_size + blk_desc_len + mode_pages_tot_len; + /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ + mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; + + blk_desc_offset = mph_size; + mode_pages_offset_1 = blk_desc_offset + blk_desc_len; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(response, 0, resp_size); + + res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, + llbaa, mode_data_length, blk_desc_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + if (blk_desc_len > 0) { + res = nvme_trans_fill_blk_desc(ns, hdr, + &response[blk_desc_offset], + blk_desc_len, llbaa); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + } + res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], + mode_pages_tot_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, 
response, xfer_len); + + out_free: + kfree(response); + out_mem: + return res; +} + +/* Read Capacity Helper Functions */ + +static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, + u8 cdb16) +{ + u8 flbas; + u32 lba_length; + u64 rlba; + u8 prot_en; + u8 p_type_lut[4] = {0, 0, 1, 2}; + u64 tmp_rlba; + u32 tmp_rlba_32; + u32 tmp_len; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + rlba = le64_to_cpup(&id_ns->nsze) - 1; + (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); + + if (!cdb16) { + if (rlba > 0xFFFFFFFF) + rlba = 0xFFFFFFFF; + tmp_rlba_32 = cpu_to_be32(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba_32, sizeof(u32)); + memcpy(&response[4], &tmp_len, sizeof(u32)); + } else { + tmp_rlba = cpu_to_be64(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba, sizeof(u64)); + memcpy(&response[8], &tmp_len, sizeof(u32)); + response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; + /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ + /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ + /* Bytes 16-31 - Reserved */ + } +} + +/* Start Stop Unit Helper Functions */ + +static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 pc, u8 pcmod, u8 start) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int lowest_pow_st; /* max npss = lowest power consumption */ + unsigned ps_desired = 0; + + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + lowest_pow_st = id_ctrl->npss - 1; + + switch (pc) { + case NVME_POWER_STATE_START_VALID: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0 && start == 0x1) + ps_desired = POWER_STATE_0; + if (pcmod == 0 && start == 0x0) + ps_desired = lowest_pow_st; + break; + case NVME_POWER_STATE_ACTIVE: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0) + ps_desired = POWER_STATE_0; + break; + case NVME_POWER_STATE_IDLE: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ + /* min of desired state and (lps-1) because lps is STOP */ + if (pcmod == 0x0) + ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1)); + else if (pcmod == 0x1) + ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1)); + else if (pcmod == 0x2) + ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_STANDBY: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ + if (pcmod == 0x0) + ps_desired = max(0, (lowest_pow_st - 2)); + else if (pcmod == 0x1) + ps_desired = max(0, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_LU_CONTROL: + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +/* Write Buffer Helper Functions */ +/* Also using this for 
Format Unit with hdr passed as NULL, and buffer_id, 0 */ + +static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 opcode, u32 tot_len, u32 offset, + u8 buffer_id) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + struct nvme_command c; + struct nvme_iod *iod = NULL; + unsigned length; + + memset(&c, 0, sizeof(c)); + c.common.opcode = opcode; + if (opcode == nvme_admin_download_fw) { + if (hdr->iovec_count > 0) { + /* Assuming SGL is not allowed for this command */ + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + iod = nvme_map_user_pages(dev, DMA_TO_DEVICE, + (unsigned long)hdr->dxferp, tot_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + length = nvme_setup_prps(dev, &c.common, iod, tot_len, + GFP_KERNEL); + if (length != tot_len) { + res = -ENOMEM; + goto out_unmap; + } + + c.dlfw.numd = (tot_len/BYTES_TO_DWORDS) - 1; + c.dlfw.offset = offset/BYTES_TO_DWORDS; + } else if (opcode == nvme_admin_activate_fw) { + c.common.cdw10[0] = buffer_id; + /* AA=01b Replace & activate at reset */ + c.common.cdw10[0] |= 0x00000008; + } + + nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_unmap; + if (nvme_sc) + res = nvme_sc; + + out_unmap: + if (opcode == nvme_admin_download_fw) { + nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod); + nvme_free_iod(dev, iod); + } + out: + return res; +} + +/* Mode Select Helper Functions */ + +static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, + u16 *bd_len, u8 *llbaa) +{ + if (cdb10) { + /* 10 Byte CDB */ + *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] && + MODE_SELECT_10_LLBAA_MASK; + } else { + /* 6 Byte CDB */ + *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; + } +} + +static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, + u16 idx, u16 bd_len, u8 llbaa) +{ + u16 bd_num; + + bd_num = bd_len / ((llbaa == 0) ? + SHORT_DESC_BLOCK : LONG_DESC_BLOCK); + /* Store block descriptor info if a FORMAT UNIT comes later */ + /* TODO Saving 1st BD info; what to do if multiple BD received? */ + if (llbaa == 0) { + /* Standard Block Descriptor - spc4r34 7.5.5.1 */ + ns->mode_select_num_blocks = + (parm_list[idx + 1] << 16) + + (parm_list[idx + 2] << 8) + + (parm_list[idx + 3]); + + ns->mode_select_block_len = + (parm_list[idx + 5] << 16) + + (parm_list[idx + 6] << 8) + + (parm_list[idx + 7]); + } else { + /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ + ns->mode_select_num_blocks = + (((u64)parm_list[idx + 0]) << 56) + + (((u64)parm_list[idx + 1]) << 48) + + (((u64)parm_list[idx + 2]) << 40) + + (((u64)parm_list[idx + 3]) << 32) + + (((u64)parm_list[idx + 4]) << 24) + + (((u64)parm_list[idx + 5]) << 16) + + (((u64)parm_list[idx + 6]) << 8) + + ((u64)parm_list[idx + 7]); + + ns->mode_select_block_len = + (parm_list[idx + 12] << 24) + + (parm_list[idx + 13] << 16) + + (parm_list[idx + 14] << 8) + + (parm_list[idx + 15]); + } +} + +static u16 nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *mode_page, u8 page_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + unsigned dword11; + + switch (page_code) { + case MODE_PAGE_CACHING: + dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 
1 : 0); + nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, + 0, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + break; + if (nvme_sc) { + res = nvme_sc; + break; + } + break; + case MODE_PAGE_CONTROL: + break; + case MODE_PAGE_POWER_CONDITION: + /* Verify the OS is not trying to set timers */ + if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + + return res; +} + +static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u16 parm_list_len, u8 pf, + u8 sp, u8 cdb10) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u16 bd_len; + u8 llbaa = 0; + u16 index, saved_index; + u8 page_code; + u16 mp_size; + + /* Get parm list from data-in/out buffer */ + parm_list = kmalloc(parm_list_len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + + res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); + index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); + + if (bd_len != 0) { + /* Block Descriptors present, parse */ + nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); + index += bd_len; + } + saved_index = index; + + /* Multiple mode pages may be present; iterate through all */ + /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + if ((page_code != MODE_PAGE_CACHING) && + (page_code != MODE_PAGE_CONTROL) && + (page_code != MODE_PAGE_POWER_CONDITION)) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + index += mp_size; + } while (index < parm_list_len); + + /* In 2nd Iteration, do the NVME Commands */ + index = saved_index; + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], + page_code); + if (res != SNTI_TRANSLATION_SUCCESS) + break; + index += mp_size; + } while (index < parm_list_len); + + out_mem: + kfree(parm_list); + out: + return res; +} + +/* Format Unit Helper Functions */ + +static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + + /* + * SCSI Expects a MODE SELECT would have been issued prior to + * a FORMAT UNIT, and the block size and number would be used + * from the block descriptor in it. If a MODE SELECT had not + * been issued, FORMAT shall use the current values for both. 
+ */ + + if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + if (ns->mode_select_num_blocks == 0) + ns->mode_select_num_blocks = id_ns->ncap; + if (ns->mode_select_block_len == 0) { + flbas = (id_ns->flbas) & 0x0F; + ns->mode_select_block_len = + (1 << (id_ns->lbaf[flbas].ds)); + } + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + mem, dma_addr); + } + out: + return res; +} + +static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, + u8 format_prot_info, u8 *nvme_pf_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u8 pf_usage, pf_code; + + parm_list = kmalloc(len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + res = nvme_trans_copy_from_user(hdr, parm_list, len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & + FORMAT_UNIT_IMMED_MASK) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + + if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && + (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & + FORMAT_UNIT_PROT_FIELD_USAGE_MASK; + pf_code = (pf_usage << 2) | format_prot_info; + switch (pf_code) { + case 0: + *nvme_pf_code = 0; + break; + case 2: + *nvme_pf_code = 1; + break; + case 3: + *nvme_pf_code = 2; + break; + case 7: + *nvme_pf_code = 3; + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out_mem: + kfree(parm_list); + out: + return res; +} + +static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 prot_info) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 i; + u8 flbas, nlbaf; + u8 selected_lbaf = 0xFF; + u32 cdw10 = 0; + struct nvme_command c; + + /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + nlbaf = id_ns->nlbaf; + + for (i = 0; i < nlbaf; i++) { + if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { + selected_lbaf = i; + break; + } + } + if (selected_lbaf > 0x0F) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + if (ns->mode_select_num_blocks != id_ns->ncap) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + 
ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	}
+
+	cdw10 |= prot_info << 5;
+	cdw10 |= selected_lbaf & 0x0F;
+	memset(&c, 0, sizeof(c));
+	c.format.opcode = nvme_admin_format_nvm;
+	c.format.nsid = cpu_to_le32(ns->ns_id);
+	c.format.cdw10 = cpu_to_le32(cdw10);
+
+	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc)
+		res = nvme_sc;
+
+ out_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+			  dma_addr);
+ out:
+	return res;
+}
+
+/* Read/Write Helper Functions */
+
+static inline void nvme_trans_get_io_cdb6(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = 0;
+	cdb_info->prot_info = 0;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) &
+					IO_6_CDB_LBA_MASK;
+	cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET);
+
+	/* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */
+	if (cdb_info->xfer_len == 0)
+		cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN;
+}
+
+static inline void nvme_trans_get_io_cdb10(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb12(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb16(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET);
+}
+
+static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr,
+					struct nvme_trans_io_cdb *cdb_info,
+					u32 max_blocks)
+{
+	/* If using iovecs, send one nvme command per vector */
+	if (hdr->iovec_count > 0)
+		return hdr->iovec_count;
+	else if (cdb_info->xfer_len > max_blocks)
+		return ((cdb_info->xfer_len - 1) / max_blocks) + 1;
+	else
+		return 1;
+}
+
+static u16 nvme_trans_io_get_control(struct nvme_ns *ns,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	u16 control = 0;
+
+	/* When Protection information support is added, implement here */
+
+	if (cdb_info->fua > 0)
+		control |= NVME_RW_FUA;
+
+	return control;
+}
+
+static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				struct nvme_trans_io_cdb *cdb_info, u8 is_write)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_queue *nvmeq;
+	u32 num_cmds;
+	struct nvme_iod *iod;
+	u64 unit_len;
+	u64 unit_num_blocks;	/* Number of blocks to xfer in each nvme cmd */
+	u32 retcode;
+	u32 i = 0;
+	u64 nvme_offset = 0;
+	void *next_mapping_addr;
+	struct nvme_command c;
+	u8 opcode = (is_write ?
nvme_cmd_write : nvme_cmd_read); + u16 control; + u32 max_blocks = (dev->max_hw_sectors << 9) >> ns->lba_shift; + + num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); + + /* + * This loop handles two cases. + * First, when an SGL is used in the form of an iovec list: + * - Use iov_base as the next mapping address for the nvme command_id + * - Use iov_len as the data transfer length for the command. + * Second, when we have a single buffer + * - If larger than max_blocks, split into chunks, offset + * each nvme command accordingly. + */ + for (i = 0; i < num_cmds; i++) { + memset(&c, 0, sizeof(c)); + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + unit_len = sgl[i].iov_len; + unit_num_blocks = unit_len >> ns->lba_shift; + next_mapping_addr = sgl[i].iov_base; + } else { + unit_num_blocks = min((u64)max_blocks, + (cdb_info->xfer_len - nvme_offset)); + unit_len = unit_num_blocks << ns->lba_shift; + next_mapping_addr = hdr->dxferp + + ((1 << ns->lba_shift) * nvme_offset); + } + + c.rw.opcode = opcode; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); + c.rw.length = cpu_to_le16(unit_num_blocks - 1); + control = nvme_trans_io_get_control(ns, cdb_info); + c.rw.control = cpu_to_le16(control); + + iod = nvme_map_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + (unsigned long)next_mapping_addr, unit_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + retcode = nvme_setup_prps(dev, &c.common, iod, unit_len, + GFP_KERNEL); + if (retcode != unit_len) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = -ENOMEM; + goto out; + } + + nvme_offset += unit_num_blocks; + + nvmeq = get_nvmeq(dev); + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep + * preemption disabled. We may be preempted at any + * point, and be rescheduled to a different CPU. That + * will cause cacheline bouncing, but no additional + * races since q_lock already protects against other + * CPUs. + */ + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, + NVME_IO_TIMEOUT); + if (nvme_sc != NVME_SC_SUCCESS) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = nvme_trans_status_code(hdr, nvme_sc); + goto out; + } + nvme_unmap_user_pages(dev, + (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + } + res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + + out: + return res; +} + + +/* SCSI Command Translation Functions */ + +static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_trans_io_cdb cdb_info; + u8 opcode = cmd[0]; + u64 xfer_bytes; + u64 sum_iov_len = 0; + struct sg_iovec *sgl; + int i; + + /* Extract Fields from CDB */ + switch (opcode) { + case WRITE_6: + case READ_6: + nvme_trans_get_io_cdb6(cmd, &cdb_info); + break; + case WRITE_10: + case READ_10: + nvme_trans_get_io_cdb10(cmd, &cdb_info); + break; + case WRITE_12: + case READ_12: + nvme_trans_get_io_cdb12(cmd, &cdb_info); + break; + case WRITE_16: + case READ_16: + nvme_trans_get_io_cdb16(cmd, &cdb_info); + break; + default: + /* Will never really reach here */ + res = SNTI_INTERNAL_ERROR; + goto out; + } + + /* Calculate total length of transfer (in bytes) */ + if (hdr->iovec_count > 0) { + sgl = hdr->dxferp; + for (i = 0; i < hdr->iovec_count; i++) { + sum_iov_len += sgl[i].iov_len; + /* IO vector sizes should be multiples of block size */ + if (sgl[i].iov_len % (1 << ns->lba_shift) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + } + } else { + sum_iov_len = hdr->dxfer_len; + } + + /* As Per sg ioctl howto, if the lengths differ, use the lower one */ + xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); + + /* If block count and actual data buffer size dont match, error out */ + if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { + res = -EINVAL; + goto out; + } + + /* Check for 0 length transfer - it is not illegal */ + if (cdb_info.xfer_len == 0) + goto out; + + /* Send NVMe IO Command(s) */ + res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 evpd; + u8 page_code; + int alloc_len; + u8 *inq_response; + + evpd = GET_INQ_EVPD_BIT(cmd); + page_code = GET_INQ_PAGE_CODE(cmd); + alloc_len = GET_INQ_ALLOC_LENGTH(cmd); + + inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + if (evpd == 0) { + if (page_code == INQ_STANDARD_INQUIRY_PAGE) { + res = nvme_trans_standard_inquiry_page(ns, hdr, + inq_response, alloc_len); + } else { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } else { + switch (page_code) { + case VPD_SUPPORTED_PAGES: + res = nvme_trans_supported_vpd_pages(ns, hdr, + inq_response, alloc_len); + break; + case VPD_SERIAL_NUMBER: + res = nvme_trans_unit_serial_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_DEVICE_IDENTIFIERS: + res = nvme_trans_device_id_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_EXTENDED_INQUIRY: + res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); + break; + case VPD_BLOCK_DEV_CHARACTERISTICS: + res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + } + kfree(inq_response); + out_mem: + return res; +} + 
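/*
 * Worked example of the transfer-length check in nvme_trans_io() above
 * (values illustrative): a READ_10 with TRANSFER LENGTH 8 on a
 * 512-byte-LBA namespace describes 8 << 9 = 4096 bytes, so
 * hdr->dxfer_len (or the sum of the iovec lengths, whichever is
 * smaller) must be exactly 4096; anything else fails with -EINVAL
 * rather than guessing which of the two lengths the caller meant.
 */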
+static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 sp; + u8 pc; + u8 page_code; + + sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET); + if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET); + page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK; + pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; + if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET); + switch (page_code) { + case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: + res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); + break; + case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: + res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); + break; + case LOG_PAGE_TEMPERATURE_PAGE: + res = nvme_trans_log_temperature(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 cdb10 = 0; + u16 parm_list_len; + u8 page_format; + u8 save_pages; + + page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET); + page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK; + + save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET); + save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK; + + if (GET_OPCODE(cmd) == MODE_SELECT) { + parm_list_len = GET_U8_FROM_CDB(cmd, + MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET); + } else { + parm_list_len = GET_U16_FROM_CDB(cmd, + MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET); + cdb10 = 1; + } + + if (parm_list_len != 0) { + /* + * According to SPC-4 r24, a paramter list length field of 0 + * shall not be considered an error + */ + res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, + page_format, save_pages, cdb10); + } + + return res; +} + +static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 cdb10 = 0; + u8 page_code; + u8 pc; + + if (GET_OPCODE(cmd) == MODE_SENSE) { + alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET); + } else { + alloc_len = GET_U16_FROM_CDB(cmd, + MODE_SENSE10_ALLOC_LEN_OFFSET); + cdb10 = 1; + } + + pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) & + MODE_SENSE_PAGE_CONTROL_MASK; + if (pc != MODE_SENSE_PC_CURRENT_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) & + MODE_SENSE_PAGE_CODE_MASK; + switch (page_code) { + case MODE_PAGE_CACHING: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_caching_page, + MODE_PAGE_CACHING_LEN); + break; + case MODE_PAGE_CONTROL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_control_page, + MODE_PAGE_CONTROL_LEN); + break; + case MODE_PAGE_POWER_CONDITION: + res = 
nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_pow_cnd_page, + MODE_PAGE_POW_CND_LEN); + break; + case MODE_PAGE_INFO_EXCEP: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_inf_exc_page, + MODE_PAGE_INF_EXC_LEN); + break; + case MODE_PAGE_RETURN_ALL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_all_pages, + MODE_PAGE_ALL_LEN); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len = READ_CAP_10_RESP_SIZE; + u32 resp_size = READ_CAP_10_RESP_SIZE; + u32 xfer_len; + u8 cdb16; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 *response; + + cdb16 = IS_READ_CAP_16(cmd); + if (cdb16) { + alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); + resp_size = READ_CAP_16_RESP_SIZE; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + nvme_trans_fill_read_cap(response, id_ns, cdb16); + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len, xfer_len, resp_size; + u8 select_report; + u8 *response; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + u32 ll_length, lun_id; + u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; + u32 tmp_len; + + alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); + select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); + + if ((select_report != ALL_LUNS_RETURNED) && + (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && + (select_report != RESTRICTED_LUNS_RETURNED)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } else { + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + ll_length = id_ctrl->nn * LUN_ENTRY_SIZE; + resp_size = ll_length + LUN_DATA_HEADER_SIZE; + + if (alloc_len < resp_size) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_dma; + } + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res 
= -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + + /* The first LUN ID will always be 0 per the SAM spec */ + for (lun_id = 0; lun_id < id_ctrl->nn; lun_id++) { + /* + * Set the LUN Id and then increment to the next LUN + * location in the parameter data. + */ + u64 tmp_id = cpu_to_be64(lun_id); + memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); + lun_id_offset += LUN_ENTRY_SIZE; + } + tmp_len = cpu_to_be32(ll_length); + memcpy(response, &tmp_len, sizeof(u32)); + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 alloc_len, xfer_len, resp_size; + u8 desc_format; + u8 *response; + + alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); + desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); + desc_format &= REQUEST_SENSE_DESC_MASK; + + resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : + (FIXED_FMT_SENSE_DATA_SIZE)); + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out; + } + memset(response, 0, resp_size); + + if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { + /* Descriptor Format Sense Data */ + response[0] = DESC_FORMAT_SENSE_DATA; + response[1] = NO_SENSE; + /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ + response[2] = SCSI_ASC_NO_SENSE; + response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ + } else { + /* Fixed Format Sense Data */ + response[0] = FIXED_SENSE_DATA; + /* Byte 1 = Obsolete */ + response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ + /* Bytes 3-6 - Information - set to zero */ + response[7] = FIXED_SENSE_DATA_ADD_LENGTH; + /* Bytes 8-11 - Cmd Specific Information - set to zero */ + response[12] = SCSI_ASC_NO_SENSE; + response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* Byte 14 = Field Replaceable Unit Code = 0 */ + /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out: + return res; +} + +static int nvme_trans_security_protocol(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + u8 immed, pcmod, pc, no_flush, start; + + immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); + pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); + pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); + no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); + start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); + + immed &= START_STOP_UNIT_CDB_IMMED_MASK; + pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; + pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; + no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK; + start &= START_STOP_UNIT_CDB_START_MASK; + + if (immed != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, 
SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } else { + if (no_flush == 0) { + /* Issue NVME FLUSH command prior to START STOP UNIT */ + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + put_nvmeq(nvmeq); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + } + /* Setup the expected power state transition */ + res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); + } + + out: + return res; +} + +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) + res = nvme_sc; + + out: + return res; +} + +static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 parm_hdr_len = 0; + u8 nvme_pf_code = 0; + u8 format_prot_info, long_list, format_data; + + format_prot_info = GET_U8_FROM_CDB(cmd, + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); + long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); + format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); + + format_prot_info = (format_prot_info & + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; + long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK; + format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; + + if (format_data != 0) { + if (format_prot_info != 0) { + if (long_list == 0) + parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; + else + parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; + } + } else if (format_data == 0 && format_prot_info != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + /* Get parm header from data-in/out buffer */ + /* + * According to the translation spec, the only fields in the parameter + * list we are concerned with are in the header. So allocate only that. 
+ */ + if (parm_hdr_len > 0) { + res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, + format_prot_info, &nvme_pf_code); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + } + + /* Attempt to activate any previously downloaded firmware image */ + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); + + /* Determine Block size and count and send format command */ + res = nvme_trans_fmt_set_blk_size_count(ns, hdr); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); + + out: + return res; +} + +static int nvme_trans_test_unit_ready(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_dev *dev = ns->dev; + + if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + NOT_READY, SCSI_ASC_LUN_NOT_READY, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + else + res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); + + return res; +} + +static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u32 buffer_offset, parm_list_length; + u8 buffer_id, mode; + + parm_list_length = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); + if (parm_list_length % BYTES_TO_DWORDS != 0) { + /* NVMe expects Firmware file to be a whole number of DWORDS */ + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); + if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & + WRITE_BUFFER_CDB_MODE_MASK; + buffer_offset = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); + + switch (mode) { + case DOWNLOAD_SAVE_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case DOWNLOAD_SAVE_DEFER_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case ACTIVATE_DEFERRED_MICROCODE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) +{ + u8 cmd[BLK_MAX_CDB]; + int retcode; + unsigned int opcode; + + if (hdr->cmdp == NULL) + return -EMSGSIZE; + if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + + opcode = cmd[0]; + + switch (opcode) { + case READ_6: + case READ_10: + case READ_12: + case READ_16: + retcode = nvme_trans_io(ns, hdr, 0, cmd); + break; + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + retcode = nvme_trans_io(ns, hdr, 1, cmd); + break; + case INQUIRY: + retcode = nvme_trans_inquiry(ns, hdr, cmd); + break; + case LOG_SENSE: + retcode = nvme_trans_log_sense(ns, hdr, cmd); + break; + case 
MODE_SELECT: + case MODE_SELECT_10: + retcode = nvme_trans_mode_select(ns, hdr, cmd); + break; + case MODE_SENSE: + case MODE_SENSE_10: + retcode = nvme_trans_mode_sense(ns, hdr, cmd); + break; + case READ_CAPACITY: + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + break; + case SERVICE_ACTION_IN: + if (IS_READ_CAP_16(cmd)) + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + else + goto out; + break; + case REPORT_LUNS: + retcode = nvme_trans_report_luns(ns, hdr, cmd); + break; + case REQUEST_SENSE: + retcode = nvme_trans_request_sense(ns, hdr, cmd); + break; + case SECURITY_PROTOCOL_IN: + case SECURITY_PROTOCOL_OUT: + retcode = nvme_trans_security_protocol(ns, hdr, cmd); + break; + case START_STOP: + retcode = nvme_trans_start_stop(ns, hdr, cmd); + break; + case SYNCHRONIZE_CACHE: + retcode = nvme_trans_synchronize_cache(ns, hdr, cmd); + break; + case FORMAT_UNIT: + retcode = nvme_trans_format_unit(ns, hdr, cmd); + break; + case TEST_UNIT_READY: + retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); + break; + case WRITE_BUFFER: + retcode = nvme_trans_write_buffer(ns, hdr, cmd); + break; + default: + out: + retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + return retcode; +} + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) +{ + struct sg_io_hdr hdr; + int retcode; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) + return -EFAULT; + if (hdr.interface_id != 'S') + return -EINVAL; + if (hdr.cmd_len > BLK_MAX_CDB) + return -EINVAL; + + retcode = nvme_scsi_translate(ns, &hdr); + if (retcode < 0) + return retcode; + if (retcode > 0) + retcode = SNTI_TRANSLATION_SUCCESS; + if (copy_to_user(__user u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) + return -EFAULT; + + return retcode; +} + +int nvme_sg_get_version_num(int __user *ip) +{ + return put_user(sg_version_num, ip); +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f1974cab60c..aa575033dbe 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -546,6 +546,8 @@ struct nvme_ns { int ns_id; int lba_shift; + u64 mode_select_num_blocks; + u32 mode_select_block_len; }; /* @@ -563,6 +565,39 @@ struct nvme_iod { dma_addr_t first_dma; struct scatterlist sg[0]; }; + +/** + * nvme_free_iod - frees an nvme_iod + * @dev: The device that the I/O was submitted to + * @iod: The memory to free + */ +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); + +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp); +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length); +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod); +struct nvme_queue *get_nvmeq(struct nvme_dev *dev); +void put_nvmeq(struct nvme_queue *nvmeq); +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout); +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); +int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, + u32 *result); +int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, + dma_addr_t dma_addr); +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result); +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result); + 
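/*
 * Typical use of the identify helpers above, sketched with error
 * handling elided (4096 bytes is the size of the identify data
 * structure):
 *
 *	mem = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
 *				 GFP_KERNEL);
 *	if (nvme_identify(dev, 0, 1, dma_addr) == 0) {
 *		struct nvme_id_ctrl *ctrl = mem;
 *		unsigned int nn = le32_to_cpu(ctrl->nn);
 *		...
 *	}
 *	dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
 */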
+struct sg_io_hdr; + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); +int nvme_sg_get_version_num(int __user *ip); + #endif #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From acb7aa0db09b8abd38abeb84334a8a27a52fbb1b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 4 Feb 2013 14:44:33 -0800 Subject: NVMe: Use round_jiffies_relative() for the periodic, once-per-second timer The nvme driver has a "once per second" event where the management kthread wakes up the system and then reschedules itself for 1 second later. For power efficiency reasons, I'd like this timer to happen together with other wakeups in the system. This patch makes the schedule_timeout() call in the kthread use round_jiffies_relative(), causing the wakeup to at least align with other "once per X seconds" events in the kernel. Signed-off-by: Arjan van de Ven Tested-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a89f7dbefba..32fdfe9a515 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1309,7 +1309,7 @@ static int nvme_kthread(void *data) } spin_unlock(&dev_list_lock); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(round_jiffies_relative(HZ)); } return 0; } -- cgit v1.2.3 From 063cc6d5591ea9c0631b81ac5c7b829d99738b2f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Mar 2013 21:28:22 -0400 Subject: NVMe: Abstract out sector to block number conversion Introduce nvme_block_nr() to help convert sectors to block numbers. This fixes an integer overflow in the SCSI conversion layer, and it's slightly less typing than opencoding it. Signed-off-by: Matthew Wilcox Acked-by: Keith Busch --- drivers/block/nvme-core.c | 4 ++-- drivers/block/nvme-scsi.c | 2 +- include/linux/nvme.h | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 32fdfe9a515..f3ea52aa3e5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -477,7 +477,7 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, range->cattr = cpu_to_le32(0); range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; @@ -590,7 +590,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.nsid = cpu_to_le32(ns->ns_id); length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, GFP_ATOMIC); - cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_sector)); cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 483af3585c9..db7052ea8d0 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2040,7 +2040,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; u8 opcode = (is_write ? 
nvme_cmd_write : nvme_cmd_read); u16 control; - u32 max_blocks = (dev->max_hw_sectors << 9) >> ns->lba_shift; + u32 max_blocks = nvme_block_nr(ns, dev->max_hw_sectors); num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aa575033dbe..09f419d4da4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -566,6 +566,11 @@ struct nvme_iod { struct scatterlist sg[0]; }; +static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) +{ + return (sector >> (ns->lba_shift - 9)); +} + /** * nvme_free_iod - frees an nvme_iod * @dev: The device that the I/O was submitted to -- cgit v1.2.3 From 422ef0c7c81934c3344e1623d2dcfda5fa2fab8d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Apr 2013 11:22:36 -0400 Subject: NVMe: Don't fail initialisation unnecessarily The nvme_dev_add() function currently returns the last error code that it saw, which (if everything else succeeds) happens to be the result of an optional command, so it can legitimately fail. Looking at the error path more closely reveals that we should return success from this function, even if no device namespaces are added. So once the queues are created and the device has responded to Identify, make sure that this function succeeds. Signed-off-by: Matthew Wilcox Acked-by: Keith Busch --- drivers/block/nvme-core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f3ea52aa3e5..357c961151a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1502,6 +1502,12 @@ static void nvme_free_queues(struct nvme_dev *dev) nvme_free_queue(dev, i); } +/* + * Return: error value if an error occurred setting up the queues or calling + * Identify Device. 0 if these succeeded, even if adding some of the + * namespaces failed. At the moment, these failures are silent. TBD which + * failures should be reported. + */ static int nvme_dev_add(struct nvme_dev *dev) { int res, nn, i; @@ -1555,7 +1561,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); - + res = 0; goto out; out_free: -- cgit v1.2.3 From 8741ee4cb6268fdb1bf22a396382004e97210f38 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 4 Apr 2013 17:52:27 -0600 Subject: NVMe: Fix sparse warnings in scsi emulation Sparse produced warnings for some instances of mismatched types and direct userspace dereferences. This patch fixes those for the scsi emulation layer. 
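The recurring fix, in sketch form (error handling trimmed): instead of
dereferencing the user-supplied iovec array through hdr->dxferp, each
element is first copied into kernel space and only then used:

	struct sg_iovec sgl;

	if (copy_from_user(&sgl, hdr->dxferp + i * sizeof(struct sg_iovec),
			   sizeof(struct sg_iovec)))
		return -EFAULT;
	/* sgl.iov_base and sgl.iov_len are now safe to inspect */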
Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 99 ++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index db7052ea8d0..bbfb2883f05 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -380,11 +380,16 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, size_t xfer_len; if (hdr->iovec_count > 0) { - struct sg_iovec *sgl = hdr->dxferp; + struct sg_iovec sgl; for (i = 0; i < hdr->iovec_count; i++) { - xfer_len = min(remaining, sgl[i].iov_len); - not_copied = copy_to_user(__user sgl[i].iov_base, index, + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + not_copied = copy_to_user(sgl.iov_base, index, xfer_len); if (not_copied) { res = -EFAULT; @@ -397,7 +402,7 @@ static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, } return res; } - not_copied = copy_to_user(__user hdr->dxferp, from, n); + not_copied = copy_to_user(hdr->dxferp, from, n); if (not_copied) res = -EFAULT; return res; @@ -416,12 +421,17 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, size_t xfer_len; if (hdr->iovec_count > 0) { - struct sg_iovec *sgl = hdr->dxferp; + struct sg_iovec sgl; for (i = 0; i < hdr->iovec_count; i++) { - xfer_len = min(remaining, sgl[i].iov_len); - not_copied = copy_from_user(index, - __user sgl[i].iov_base, xfer_len); + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + not_copied = copy_from_user(index, sgl.iov_base, + xfer_len); if (not_copied) { res = -EFAULT; break; @@ -434,7 +444,7 @@ static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, return res; } - not_copied = copy_from_user(to, __user hdr->dxferp, n); + not_copied = copy_from_user(to, hdr->dxferp, n); if (not_copied) res = -EFAULT; return res; @@ -469,7 +479,7 @@ static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); hdr->sb_len_wr = xfer_len; - if (copy_to_user(__user hdr->sbp, resp, xfer_len) > 0) + if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) res = -EFAULT; } @@ -774,7 +784,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, int nvme_sc; u8 ieee[4]; int xfer_len; - u32 tmp_id = cpu_to_be64(ns->ns_id); + __be32 tmp_id = cpu_to_be32(ns->ns_id); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); @@ -1190,15 +1200,15 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, lba_length = (1 << (id_ns->lbaf[flbas].ds)); if (llbaa == 0) { - u32 tmp_cap = cpu_to_be32(id_ns->ncap); + __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); /* Byte 4 is reserved */ - u32 tmp_len = cpu_to_be32(lba_length) & 0x00FFFFFF; + __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); memcpy(resp, &tmp_cap, sizeof(u32)); memcpy(&resp[4], &tmp_len, sizeof(u32)); } else { - u64 tmp_cap = cpu_to_be64(id_ns->ncap); - u32 tmp_len = cpu_to_be32(lba_length); + __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); + __be32 tmp_len = cpu_to_be32(lba_length); memcpy(resp, &tmp_cap, sizeof(u64)); /* Bytes 8, 9, 10, 11 are reserved */ @@ -1412,9 +1422,9 @@ 
static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, u64 rlba; u8 prot_en; u8 p_type_lut[4] = {0, 0, 1, 2}; - u64 tmp_rlba; - u32 tmp_rlba_32; - u32 tmp_len; + __be64 tmp_rlba; + __be32 tmp_rlba_32; + __be32 tmp_len; flbas = (id_ns->flbas) & 0x0F; lba_length = (1 << (id_ns->lbaf[flbas].ds)); @@ -1563,12 +1573,13 @@ static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_unmap; } - c.dlfw.numd = (tot_len/BYTES_TO_DWORDS) - 1; - c.dlfw.offset = offset/BYTES_TO_DWORDS; + c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); + c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); } else if (opcode == nvme_admin_activate_fw) { - c.common.cdw10[0] = buffer_id; + c.common.cdw10[0] = cpu_to_le32(buffer_id); /* AA=01b Replace & activate at reset */ - c.common.cdw10[0] |= 0x00000008; + c.common.cdw10[0] = cpu_to_le32(le32_to_cpu( + c.common.cdw10[0]) | 0x00000008); } nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); @@ -1800,7 +1811,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, id_ns = mem; if (ns->mode_select_num_blocks == 0) - ns->mode_select_num_blocks = id_ns->ncap; + ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); if (ns->mode_select_block_len == 0) { flbas = (id_ns->flbas) & 0x0F; ns->mode_select_block_len = @@ -1920,7 +1931,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } - if (ns->mode_select_num_blocks != id_ns->ncap) { + if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); @@ -1930,7 +1941,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, cdw10 |= selected_lbaf & 0x0F; memset(&c, 0, sizeof(c)); c.format.opcode = nvme_admin_format_nvm; - c.format.nsid = ns->ns_id; + c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); @@ -2036,7 +2047,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u32 retcode; u32 i = 0; u64 nvme_offset = 0; - void *next_mapping_addr; + void __user *next_mapping_addr; struct nvme_command c; u8 opcode = (is_write ? 
nvme_cmd_write : nvme_cmd_read); u16 control; @@ -2056,11 +2067,16 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, for (i = 0; i < num_cmds; i++) { memset(&c, 0, sizeof(c)); if (hdr->iovec_count > 0) { - struct sg_iovec *sgl = hdr->dxferp; - - unit_len = sgl[i].iov_len; + struct sg_iovec sgl; + + retcode = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (retcode) + return -EFAULT; + unit_len = sgl.iov_len; unit_num_blocks = unit_len >> ns->lba_shift; - next_mapping_addr = sgl[i].iov_base; + next_mapping_addr = sgl.iov_base; } else { unit_num_blocks = min((u64)max_blocks, (cdb_info->xfer_len - nvme_offset)); @@ -2138,8 +2154,9 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, u8 opcode = cmd[0]; u64 xfer_bytes; u64 sum_iov_len = 0; - struct sg_iovec *sgl; + struct sg_iovec sgl; int i; + size_t not_copied; /* Extract Fields from CDB */ switch (opcode) { @@ -2167,11 +2184,15 @@ static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, /* Calculate total length of transfer (in bytes) */ if (hdr->iovec_count > 0) { - sgl = hdr->dxferp; for (i = 0; i < hdr->iovec_count; i++) { - sum_iov_len += sgl[i].iov_len; + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + sum_iov_len += sgl.iov_len; /* IO vector sizes should be multiples of block size */ - if (sgl[i].iov_len % (1 << ns->lba_shift) != 0) { + if (sgl.iov_len % (1 << ns->lba_shift) != 0) { res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, ILLEGAL_REQUEST, @@ -2494,7 +2515,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; - u32 tmp_len; + __be32 tmp_len; alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); @@ -2524,7 +2545,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } id_ctrl = mem; - ll_length = id_ctrl->nn * LUN_ENTRY_SIZE; + ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; resp_size = ll_length + LUN_DATA_HEADER_SIZE; if (alloc_len < resp_size) { @@ -2543,12 +2564,12 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, memset(response, 0, resp_size); /* The first LUN ID will always be 0 per the SAM spec */ - for (lun_id = 0; lun_id < id_ctrl->nn; lun_id++) { + for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { /* * Set the LUN Id and then increment to the next LUN * location in the parameter data. */ - u64 tmp_id = cpu_to_be64(lun_id); + __be64 tmp_id = cpu_to_be64(lun_id); memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); lun_id_offset += LUN_ENTRY_SIZE; } @@ -2929,7 +2950,7 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; if (retcode > 0) retcode = SNTI_TRANSLATION_SUCCESS; - if (copy_to_user(__user u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) + if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) return -EFAULT; return retcode; -- cgit v1.2.3 From af2d9ca744af5e03390eeb3864d08ce75c860899 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Apr 2013 15:18:30 -0400 Subject: NVMe: Fix I/O cancellation status on big-endian machines The sparse bitwise checks pointed out that I needed to shift the status before changing its endianness, not after. 
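On little-endian machines the two orderings produce the same bytes,
which is why the bug went unnoticed there; schematically:

	/* before: byte-swaps the status, then shifts the swapped value */
	.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
	/* after: places the status in bits 15:1 first, then byte-swaps */
	.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),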
Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 357c961151a..a3c7d504d5e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -875,7 +875,7 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) void *ctx; nvme_completion_fn fn; static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), }; if (timeout && !time_after(now, info[cmdid].timeout)) -- cgit v1.2.3 From 1c9b52651dad0ff1fa71fc6205c86d972f25bcc0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Apr 2013 15:21:06 -0400 Subject: NVMe: Fix endian-related problems in user I/O submission path When constructing the command, dsmgmt needs to be treated as a 32-bit value, not a 16-bit value. reftag, apptag and appmask all need to be converted from native-endian to little-endian. Again, sparse's bitwise warnings caught this problem. Thanks to Keith for pointing out the correct way to fix the reftag. Signed-off-by: Matthew Wilcox Acked-by: Keith Busch --- drivers/block/nvme-core.c | 8 ++++---- include/linux/nvme.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a3c7d504d5e..dbd2103533c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1166,10 +1166,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.slba = cpu_to_le64(io.slba); c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = io.reftag; - c.rw.apptag = io.apptag; - c.rw.appmask = io.appmask; + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); /* XXX: metadata */ length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 09f419d4da4..7ae7ecfc094 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -207,11 +207,11 @@ struct nvme_common_command { __u8 flags; __u16 command_id; __le32 nsid; - __u32 cdw2[2]; + __le32 cdw2[2]; __le64 metadata; __le64 prp1; __le64 prp2; - __u32 cdw10[6]; + __le32 cdw10[6]; }; struct nvme_rw_command { -- cgit v1.2.3 From 5e82e952f04681c10f35e02ee0a4a43ec027137a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 19 Feb 2013 10:17:58 -0700 Subject: NVMe: Add a character device for each nvme device Registers a miscellaneous device for each nvme controller probed. This creates character device files as /dev/nvmeN, where N is the device instance, and supports nvme admin ioctl commands so devices without namespaces can be managed. 
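From user space, a controller can then be managed even when it exposes
no namespaces; a hypothetical sketch (field values are illustrative,
not prescriptive):

	int fd = open("/dev/nvme0", O_RDWR);	/* controller node, no namespace */
	struct nvme_admin_cmd cmd = {
		.opcode   = 0x06,			/* Identify */
		.addr     = (__u64)(uintptr_t)buf,	/* 4096-byte buffer */
		.data_len = 4096,
		.cdw10    = 1,				/* CNS 1: controller */
	};

	if (fd >= 0)
		ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);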
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 74 ++++++++++++++++++++++++++++++++++++++++------- include/linux/nvme.h | 5 ++++ 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index dbd2103533c..40908a06bd5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1647,6 +1647,56 @@ static void nvme_release_instance(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } +static void nvme_free_dev(struct kref *kref) +{ + struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + nvme_dev_remove(dev); + pci_disable_msix(dev->pci_dev); + iounmap(dev->bar); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + pci_disable_device(dev->pci_dev); + pci_release_regions(dev->pci_dev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +static int nvme_dev_open(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, + miscdev); + kref_get(&dev->kref); + f->private_data = dev; + return 0; +} + +static int nvme_dev_release(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev = f->private_data; + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct nvme_dev *dev = f->private_data; + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(dev, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int bars, result = -ENOMEM; @@ -1705,8 +1755,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto delete; + scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); + dev->miscdev.minor = MISC_DYNAMIC_MINOR; + dev->miscdev.parent = &pdev->dev; + dev->miscdev.name = dev->name; + dev->miscdev.fops = &nvme_dev_fops; + result = misc_register(&dev->miscdev); + if (result) + goto remove; + + kref_init(&dev->kref); return 0; + remove: + nvme_dev_remove(dev); delete: spin_lock(&dev_list_lock); list_del(&dev->node); @@ -1732,16 +1794,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_remove(dev); - pci_disable_msix(pdev); - iounmap(dev->bar); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - pci_disable_device(pdev); - pci_release_regions(pdev); - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); + misc_deregister(&dev->miscdev); + kref_put(&dev->kref, nvme_free_dev); } /* These functions are yet to be implemented */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7ae7ecfc094..9b6fba872f4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -507,6 +507,8 @@ struct nvme_admin_cmd { #ifdef __KERNEL__ #include +#include +#include #define NVME_IO_TIMEOUT (5 * HZ) @@ -527,6 +529,9 @@ struct nvme_dev { struct msix_entry *entry; struct nvme_bar __iomem *bar; struct list_head namespaces; + struct kref kref; + struct miscdevice miscdev; + char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; -- cgit v1.2.3 From 564a232c059913d91b491e04c2b2d670b8f94615 Mon Sep 17 
00:00:00 2001 From: Arjan van de Ven Date: Wed, 1 May 2013 16:38:23 -0400 Subject: NVMe: Set TASK_INTERRUPTIBLE before processing queues The kthread has two tasks; handling timeouts (for which it runs once per second), and submitting queued BIOs. If a BIO happens to be queued after the thread has processed the queue but before it calls schedule_timeout(), the thread will sleep for a second before submitting it, which can cause performance problems in some rare cases (that will become more common in a subsequent patch). Signed-off-by: Arjan van de Ven Tested-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 40908a06bd5..358d17700c2 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1291,7 +1291,7 @@ static int nvme_kthread(void *data) struct nvme_dev *dev; while (!kthread_should_stop()) { - __set_current_state(TASK_RUNNING); + set_current_state(TASK_INTERRUPTIBLE); spin_lock(&dev_list_lock); list_for_each_entry(dev, &dev_list, node) { int i; @@ -1308,7 +1308,6 @@ static int nvme_kthread(void *data) } } spin_unlock(&dev_list_lock); - set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(round_jiffies_relative(HZ)); } return 0; -- cgit v1.2.3 From 14385de117882fa253954063d8b8d8a6bdeae650 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 25 Apr 2013 14:39:27 -0600 Subject: NVMe: queue usage fixes in nvme-scsi Fixes nvme queue usages in scsi-to-nvme translation code to not get a queue more often than it is being put, and not use the queue in an unsafe way without it being locked. Signed-off-by: Keith Busch Acked-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index bbfb2883f05..fe2fcfff4c0 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2039,7 +2039,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + struct nvme_queue *nvmeq; u32 num_cmds; struct nvme_iod *iod; u64 unit_len; @@ -2653,7 +2653,8 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + struct nvme_queue *nvmeq; + struct nvme_command c; u8 immed, pcmod, pc, no_flush, start; immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); @@ -2675,8 +2676,14 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, } else { if (no_flush == 0) { /* Issue NVME FLUSH command prior to START STOP UNIT */ - nvme_sc = nvme_submit_flush_data(nvmeq, ns); + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = cpu_to_le32(ns->ns_id); + + nvmeq = get_nvmeq(ns->dev); put_nvmeq(nvmeq); + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2698,9 +2705,17 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, { int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + struct nvme_command c; + struct nvme_queue *nvmeq; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = 
cpu_to_le32(ns->ns_id); + + nvmeq = get_nvmeq(ns->dev); put_nvmeq(nvmeq); - nvme_sc = nvme_submit_flush_data(nvmeq, ns); + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; -- cgit v1.2.3 From ec5037335064dcc52c2fbbf3d505bae0eb27e713 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Apr 2013 15:44:24 -0600 Subject: NVMe: Add scsi unmap to SG_IO Translates a scsi unmap request from SG_IO ioctl to NVMe data-set-management deallocate. Signed-off-by: Keith Busch Acked-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index fe2fcfff4c0..1c0710ca55d 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -100,6 +100,7 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define FORMAT_UNIT_PROT_INT_OFFSET 3 #define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 #define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 +#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET 7 /* Misc. defines */ #define NIBBLE_SHIFT 4 @@ -2862,6 +2863,80 @@ static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, return res; } +struct scsi_unmap_blk_desc { + __be64 slba; + __be32 nlb; + u32 resv; +}; + +struct scsi_unmap_parm_list { + __be16 unmap_data_len; + __be16 unmap_blk_desc_data_len; + u32 resv; + struct scsi_unmap_blk_desc desc[0]; +}; + +static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + struct nvme_dev *dev = ns->dev; + struct scsi_unmap_parm_list *plist; + struct nvme_dsm_range *range; + struct nvme_queue *nvmeq; + struct nvme_command c; + int i, nvme_sc, res = -ENOMEM; + u16 ndesc, list_len; + dma_addr_t dma_addr; + + list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET); + if (!list_len) + return -EINVAL; + + plist = kmalloc(list_len, GFP_KERNEL); + if (!plist) + return -ENOMEM; + + res = nvme_trans_copy_from_user(hdr, plist, list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; + if (!ndesc || ndesc > 256) { + res = -EINVAL; + goto out; + } + + range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), + &dma_addr, GFP_KERNEL); + if (!range) + goto out; + + for (i = 0; i < ndesc; i++) { + range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); + range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); + range[i].cattr = 0; + } + + memset(&c, 0, sizeof(c)); + c.dsm.opcode = nvme_cmd_dsm; + c.dsm.nsid = cpu_to_le32(ns->ns_id); + c.dsm.prp1 = cpu_to_le64(dma_addr); + c.dsm.nr = cpu_to_le32(ndesc - 1); + c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + nvmeq = get_nvmeq(dev); + put_nvmeq(nvmeq); + + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + res = nvme_trans_status_code(hdr, nvme_sc); + + dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), + range, dma_addr); + out: + kfree(plist); + return res; +} + static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) { u8 cmd[BLK_MAX_CDB]; @@ -2936,6 +3011,9 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) case WRITE_BUFFER: retcode = nvme_trans_write_buffer(ns, hdr, cmd); break; + case UNMAP: + retcode = nvme_trans_unmap(ns, hdr, cmd); + break; default: out: retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, -- cgit v1.2.3 From 
025c557a71bd06a9f6d32259f00e2218b15bf0a4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 May 2013 13:07:51 -0600 Subject: NVMe: Free admin queue on request_irq error Fixes a potential memory leak if requesting the admin queue irq fails. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 358d17700c2..391a874e413 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1054,14 +1054,19 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) } } - if (result) { - nvme_free_queue_mem(nvmeq); - return result; - } + if (result) + goto free_q; result = queue_request_irq(dev, nvmeq, "nvme admin"); + if (result) + goto free_q; + dev->queues[0] = nvmeq; return result; + + free_q: + nvme_free_queue_mem(nvmeq); + return result; } struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, -- cgit v1.2.3 From 68b8eca5f882230e4991df0db0a3094f49420a90 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 May 2013 13:07:47 -0600 Subject: NVMe: Fix error clean-up on nvme_alloc_queue The nvme_queue's depth is not set if we fail to allocate submission queue entries, which was being used to determine how much coherent memory to free on error. Use the depth variable instead. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 391a874e413..388c54d8480 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -956,7 +956,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return nvmeq; free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: kfree(nvmeq); -- cgit v1.2.3 From a9ef4343afbe67a6abf83c0f0294e80db48e513a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 May 2013 13:07:48 -0600 Subject: NVMe: Check for NULL memory in nvme_dev_add Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 388c54d8480..2d4f2ae36d3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1527,6 +1527,8 @@ static int nvme_dev_add(struct nvme_dev *dev) mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); + if (!mem) + return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { -- cgit v1.2.3 From cbb6218fd4ae8f98ddf0d66f0826a7a3a9c88298 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 May 2013 13:07:49 -0600 Subject: NVMe: Remove dead code in nvme_dev_add There is no situation that could occur where we could error out of this function and require cleaning up allocated namespaces. 
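Taken together, these clean-up patches converge on the standard kernel
unwind idiom, distilled here from the request_irq fix above:

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	if (result)
		goto free_q;

	dev->queues[0] = nvmeq;
	return result;

 free_q:
	nvme_free_queue_mem(nvmeq);	/* undo allocations in reverse order */
	return result;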
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2d4f2ae36d3..a126c7b9dbe 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1515,7 +1515,7 @@ static void nvme_free_queues(struct nvme_dev *dev) static int nvme_dev_add(struct nvme_dev *dev) { int res, nn, i; - struct nvme_ns *ns, *next; + struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; struct nvme_id_ns *id_ns; void *mem; @@ -1533,7 +1533,7 @@ static int nvme_dev_add(struct nvme_dev *dev) res = nvme_identify(dev, 0, 1, dma_addr); if (res) { res = -EIO; - goto out_free; + goto out; } ctrl = mem; @@ -1568,13 +1568,6 @@ static int nvme_dev_add(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) add_disk(ns->disk); res = 0; - goto out; - - out_free: - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - nvme_ns_free(ns); - } out: dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); -- cgit v1.2.3 From 427e97080196548557b288517537ab7eb48c309f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Apr 2013 11:59:32 -0600 Subject: NVMe: Split non-mergeable bio requests It is possible that a bio request cannot be submitted as a single NVMe IO command if the bio_vec is not mergeable with the NVMe PRP alignment constraints. This condition was handled by submitting an IO for the mergeable portion, then submitting a follow-on IO for the remaining data after the previous IO completes. The remainder to be sent was tracked by manipulating the bio->bi_idx and bio->bi_sector. This patch splits the request as many times as necessary and submits the bios together. Since submitting the bio may cause it to be requeued on split, nvme_resubmit_bios had to be modified to remove the wait queue entry when the bio list is empty prior to submitting the bio, since a split would have added the wait queue a second time, corrupting the wait queue head task list. There are a few other benefits from doing this: it fixes a potential issue with the previous handling of a non-mergeable bio, as the requeuing method would use an unlocked nvme_queue if the callback isn't invoked on the queue's associated cpu; it will be possible to retry a failed bio at some later time, since the original bio is not manipulated; and the bio integrity extensions require the bio to be in its original condition for the checks to work correctly if we implement the end-to-end data protection in the future. 
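The reordered requeue logic this describes can be seen in the
nvme_resubmit_bios() hunk below; pulled out of its loop for clarity:

	if (bio_list_empty(&nvmeq->sq_cong))
		remove_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
		/* submit failed: re-arm the wait queue entry (a split may
		 * have emptied the list) before putting the bio back */
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add_head(&nvmeq->sq_cong, bio);
	}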
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 152 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 122 insertions(+), 30 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a126c7b9dbe..f81280663a4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -308,16 +308,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void requeue_bio(struct nvme_dev *dev, struct bio *bio) -{ - struct nvme_queue *nvmeq = get_nvmeq(dev); - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - put_nvmeq(nvmeq); - wake_up_process(nvme_thread); -} - static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { @@ -329,13 +319,10 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(dev, iod); - if (status) { + if (status) bio_endio(bio, -EIO); - } else if (bio->bi_vcnt > bio->bi_idx) { - requeue_bio(dev, bio); - } else { + else bio_endio(bio, 0); - } } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -419,25 +406,130 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, return total_len; } +struct nvme_bio_pair { + struct bio b1, b2, *parent; + struct bio_vec *bv1, *bv2; + int err; + atomic_t cnt; +}; + +static void nvme_bio_pair_endio(struct bio *bio, int err) +{ + struct nvme_bio_pair *bp = bio->bi_private; + + if (err) + bp->err = err; + + if (atomic_dec_and_test(&bp->cnt)) { + bio_endio(bp->parent, bp->err); + if (bp->bv1) + kfree(bp->bv1); + if (bp->bv2) + kfree(bp->bv2); + kfree(bp); + } +} + +static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, + int len, int offset) +{ + struct nvme_bio_pair *bp; + + BUG_ON(len > bio->bi_size); + BUG_ON(idx > bio->bi_vcnt); + + bp = kmalloc(sizeof(*bp), GFP_ATOMIC); + if (!bp) + return NULL; + bp->err = 0; + + bp->b1 = *bio; + bp->b2 = *bio; + + bp->b1.bi_size = len; + bp->b2.bi_size -= len; + bp->b1.bi_vcnt = idx; + bp->b2.bi_idx = idx; + bp->b2.bi_sector += len >> 9; + + if (offset) { + bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), + GFP_ATOMIC); + if (!bp->bv1) + goto split_fail_1; + + bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec), + GFP_ATOMIC); + if (!bp->bv2) + goto split_fail_2; + + memcpy(bp->bv1, bio->bi_io_vec, + bio->bi_max_vecs * sizeof(struct bio_vec)); + memcpy(bp->bv2, bio->bi_io_vec, + bio->bi_max_vecs * sizeof(struct bio_vec)); + + bp->b1.bi_io_vec = bp->bv1; + bp->b2.bi_io_vec = bp->bv2; + bp->b2.bi_io_vec[idx].bv_offset += offset; + bp->b2.bi_io_vec[idx].bv_len -= offset; + bp->b1.bi_io_vec[idx].bv_len = offset; + bp->b1.bi_vcnt++; + } else + bp->bv1 = bp->bv2 = NULL; + + bp->b1.bi_private = bp; + bp->b2.bi_private = bp; + + bp->b1.bi_end_io = nvme_bio_pair_endio; + bp->b2.bi_end_io = nvme_bio_pair_endio; + + bp->parent = bio; + atomic_set(&bp->cnt, 2); + + return bp; + + split_fail_2: + kfree(bp->bv1); + split_fail_1: + kfree(bp); + return NULL; +} + +static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, + int idx, int len, int offset) +{ + struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); + if (!bp) + return -ENOMEM; + + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + 
bio_list_add(&nvmeq->sq_cong, &bp->b1); + bio_list_add(&nvmeq->sq_cong, &bp->b2); + + return 0; +} + /* NVMe scatterlists require no holes in the virtual address */ #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) -static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, +static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, old_idx, length = 0, nsegs = 0; + int i, length = 0, nsegs = 0; sg_init_table(iod->sg, psegs); - old_idx = bio->bi_idx; bio_for_each_segment(bvec, bio, i) { if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { sg->length += bvec->bv_len; } else { if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) - break; + return nvme_split_and_submit(bio, nvmeq, i, + length, 0); + sg = sg ? sg + 1 : iod->sg; sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); @@ -446,13 +538,11 @@ static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, length += bvec->bv_len; bvprv = bvec; } - bio->bi_idx = i; iod->nents = nsegs; sg_mark_end(sg); - if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { - bio->bi_idx = old_idx; + if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) return -ENOMEM; - } + return length; } @@ -581,8 +671,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); - if (result < 0) + result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); + if (result <= 0) goto free_cmdid; length = result; @@ -595,8 +685,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - bio->bi_sector += length >> 9; - if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); @@ -1281,13 +1369,17 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); bio_list_add_head(&nvmeq->sq_cong, bio); break; } - if (bio_list_empty(&nvmeq->sq_cong)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); } } -- cgit v1.2.3 From 159b67d7aefe3902df91075be5d80943c1570aa8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Apr 2013 17:13:20 -0600 Subject: NVMe: Device specific stripe size handling We have an nvme device that has a concept of a stripe size. IO requests that do not transfer data crossing a stripe boundary have greater performance than IO that does cross it. This patch sets the stripe size for the device if the device and vendor ids match one with this feature, and splits IO requests that cross the stripe boundary.
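The split length the patch below computes is simply the distance from the start of the IO to the next stripe boundary. A sketch of the arithmetic, assuming stripe_size is a power of two:

	u32 offset, split_len;

	/* bi_sector is in 512-byte units, so shift by 9 to get bytes. */
	offset = (bio->bi_sector << 9) & (dev->stripe_size - 1);

	/* Bytes remaining before the next stripe boundary. */
	split_len = dev->stripe_size - offset;

With a 128 KiB stripe and a bio starting 120 KiB into a stripe, split_len is 8 KiB, so the first 8 KiB is split off and submitted as its own command.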
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 19 +++++++++++++++---- include/linux/nvme.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f81280663a4..4151a3d26e2 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -519,7 +519,11 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, { struct bio_vec *bvec, *bvprv = NULL; struct scatterlist *sg = NULL; - int i, length = 0, nsegs = 0; + int i, length = 0, nsegs = 0, split_len = bio->bi_size; + + if (nvmeq->dev->stripe_size) + split_len = nvmeq->dev->stripe_size - + ((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1)); sg_init_table(iod->sg, psegs); bio_for_each_segment(bvec, bio, i) { @@ -535,6 +539,10 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, bvec->bv_offset); nsegs++; } + + if (split_len - length < bvec->bv_len) + return nvme_split_and_submit(bio, nvmeq, i, split_len, + split_len - length); length += bvec->bv_len; bvprv = bvec; } @@ -543,6 +551,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) return -ENOMEM; + BUG_ON(length != bio->bi_size); return length; } @@ -1612,6 +1621,7 @@ static int nvme_dev_add(struct nvme_dev *dev) struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; res = nvme_setup_io_queues(dev); if (res) @@ -1634,10 +1644,11 @@ static int nvme_dev_add(struct nvme_dev *dev) memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) { - int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - } + if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) && + (dev->pci_dev->device == 0x0953) && ctrl->vs[3]) + dev->stripe_size = 1 << (ctrl->vs[3] + shift); id_ns = mem; for (i = 1; i <= nn; i++) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9b6fba872f4..af29b0e0b09 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -536,6 +536,7 @@ struct nvme_dev { char model[40]; char firmware_rev[8]; u32 max_hw_sectors; + u32 stripe_size; u16 oncs; }; -- cgit v1.2.3 From f410c680b5344f1cb63ff011c6ae3d4963f45c30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 23 Apr 2013 17:23:59 -0600 Subject: NVMe: Meta-data support in NVME_IOCTL_SUBMIT_IO This adds support for namespaces with separate meta-data formats in the submit io ioctl. The meta-data buffer has to be contiguous, so such a buffer is allocated and the mapped user pages are copied to/from it for write/read commands.
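A sketch of the write-side copy performed in the patch below: each mapped user page holding meta-data is kmapped in turn and flattened into the contiguous DMA-coherent bounce buffer (meta_mem) whose address is placed in the command's metadata pointer. For reads, the same loop runs in the opposite direction after the command completes.

	int i, meta_offset = 0;
	void *meta;

	for (i = 0; i < meta_iod->nents; i++) {
		/* Append this user page's meta-data to the bounce buffer. */
		meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
				meta_iod->sg[i].offset;
		memcpy(meta_mem + meta_offset, meta, meta_iod->sg[i].length);
		kunmap_atomic(meta);
		meta_offset += meta_iod->sg[i].length;
	}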
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 71 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/nvme.h | 1 + 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 4151a3d26e2..5a3f2235892 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1240,13 +1240,19 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_queue *nvmeq; struct nvme_user_io io; struct nvme_command c; - unsigned length; - int status; - struct nvme_iod *iod; + unsigned length, meta_len; + int status, i; + struct nvme_iod *iod, *meta_iod = NULL; + dma_addr_t meta_dma_addr; + void *meta, *uninitialized_var(meta_mem); if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + + if (meta_len && ((io.metadata & 3) || !io.metadata)) + return -EINVAL; switch (io.opcode) { case nvme_cmd_write: @@ -1272,7 +1278,38 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - /* XXX: metadata */ + + if (meta_len) { + meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len); + if (IS_ERR(meta_iod)) { + status = PTR_ERR(meta_iod); + meta_iod = NULL; + goto unmap; + } + + meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, + &meta_dma_addr, GFP_KERNEL); + if (!meta_mem) { + status = -ENOMEM; + goto unmap; + } + + if (io.opcode & 1) { + int meta_offset = 0; + + for (i = 0; i < meta_iod->nents; i++) { + meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + + meta_iod->sg[i].offset; + memcpy(meta_mem + meta_offset, meta, + meta_iod->sg[i].length); + kunmap_atomic(meta); + meta_offset += meta_iod->sg[i].length; + } + } + + c.rw.metadata = cpu_to_le64(meta_dma_addr); + } + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); nvmeq = get_nvmeq(dev); @@ -1288,8 +1325,33 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + if (meta_len) { + if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { + int meta_offset = 0; + + for (i = 0; i < meta_iod->nents; i++) { + meta = kmap_atomic(sg_page(&meta_iod->sg[i])) + + meta_iod->sg[i].offset; + memcpy(meta, meta_mem + meta_offset, + meta_iod->sg[i].length); + kunmap_atomic(meta); + meta_offset += meta_iod->sg[i].length; + } + } + + dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem, + meta_dma_addr); + } + + unmap: nvme_unmap_user_pages(dev, io.opcode & 1, iod); nvme_free_iod(dev, iod); + + if (meta_iod) { + nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod); + nvme_free_iod(dev, meta_iod); + } + return status; } @@ -1486,6 +1548,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index af29b0e0b09..971ef086ed6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -552,6 +552,7 @@ struct nvme_ns { int ns_id; int lba_shift; + int ms; u64 mode_select_num_blocks; u32 mode_select_block_len; }; -- cgit v1.2.3 From 
78f8d2577bd79ce9d62f4d9e6d3b895bd1dd1d1d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 19 Apr 2013 14:11:06 -0600 Subject: NVMe: Schedule timeout for sync commands Schedule a timeout on sync commands in case the command times out and the device is not being polled for timeouts. This prevents device removal from hanging forever if the device has stopped responding. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5a3f2235892..a232dfc1cd4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -828,7 +828,7 @@ int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, set_current_state(TASK_KILLABLE); nvme_submit_cmd(nvmeq, cmd); - schedule(); + schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { nvme_abort_command(nvmeq, cmdid); -- cgit v1.2.3 From ba47e3865e8023e79b670793a41508222b5f0322 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 4 May 2013 06:43:16 -0400 Subject: NVMe: Wait for device to acknowledge shutdown A recent update to the specification makes it clear that the host is expected to wait for the device to acknowledge the Enable bit transitioning to 0, just as it must wait for the device to acknowledge a transition to 1. Reported-by: Khosrow Panah Signed-off-by: Matthew Wilcox Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 65 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a232dfc1cd4..2b1b5a74dc7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1108,15 +1108,57 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, return ERR_PTR(result); } +static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) +{ + unsigned long timeout; + u32 bit = enabled ? NVME_CSTS_RDY : 0; + + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + + while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + return -ENODEV; + } + } + + return 0; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at!
+ */ +static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) +{ + writel(0, &dev->bar->cc); + return nvme_wait_ready(dev, cap, false); +} + +static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) +{ + return nvme_wait_ready(dev, cap, true); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { - int result = 0; + int result; u32 aqa; - u64 cap; - unsigned long timeout; + u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->db_stride = NVME_CAP_STRIDE(cap); + + result = nvme_disable_ctrl(dev, cap); + if (result < 0) + return result; nvmeq = nvme_alloc_queue(dev, 0, 64, 0); if (!nvmeq) @@ -1130,27 +1172,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - writel(0, &dev->bar->cc); writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); writel(dev->ctrl_config, &dev->bar->cc); - cap = readq(&dev->bar->cap); - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - dev->db_stride = NVME_CAP_STRIDE(cap); - - while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { - msleep(100); - if (fatal_signal_pending(current)) - result = -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); - result = -ENODEV; - } - } - + result = nvme_enable_ctrl(dev, cap); if (result) goto free_q; -- cgit v1.2.3 From 44af146a84fa4a8e136d824207dcd356958a112b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 4 May 2013 06:43:17 -0400 Subject: NVMe: Only clear the enable bit when disabling controller Many of the bits in the Controller Configuration register may only be modified when the Enable bit is clear. Clearing them at the same time as the Enable bit might be OK, but let's play it safe and only touch the Enable bit. Signed-off-by: Matthew Wilcox Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2b1b5a74dc7..310d573b9e8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1137,7 +1137,10 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) */ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { - writel(0, &dev->bar->cc); + u32 cc = readl(&dev->bar->cc); + + if (cc & NVME_CC_ENABLE) + writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); return nvme_wait_ready(dev, cap, false); } -- cgit v1.2.3 From ab3ea5bf37e7189e843e19e500e7af50e802b5f6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 6 May 2013 08:22:18 -0400 Subject: NVMe: Simplify Firmware Activate code slightly Add definitions for the three Firmware Activate actions, and change the SCSI translation code to construct the command in a temporary variable instead of translating the endianness back and forth.
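A sketch of the simplification, using the NVME_FWACT_* definitions added below: the command dword is assembled in native byte order and converted to little-endian exactly once, instead of converting to LE, back to CPU order to OR in the action bits, and to LE again.

	/* AA=01b: replace the image and activate it at the next reset. */
	u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV;

	c.common.cdw10[0] = cpu_to_le32(cdw10);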
Signed-off-by: Matthew Wilcox Reviewed-by: Vishal Verma --- drivers/block/nvme-scsi.c | 6 ++---- include/linux/nvme.h | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 1c0710ca55d..fed54b03989 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1577,10 +1577,8 @@ static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); } else if (opcode == nvme_admin_activate_fw) { - c.common.cdw10[0] = cpu_to_le32(buffer_id); - /* AA=01b Replace & activate at reset */ - c.common.cdw10[0] = cpu_to_le32(le32_to_cpu( - c.common.cdw10[0]) | 0x00000008); + u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV; + c.common.cdw10[0] = cpu_to_le32(cdw10); } nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 971ef086ed6..f451c8d6e23 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -316,6 +316,9 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), }; struct nvme_identify { -- cgit v1.2.3 From 94f370cab6e5ac514b658c6b2b3aa308cefc5c7a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 9 May 2013 14:01:38 -0600 Subject: NVMe: Use user defined admin ioctl timeout Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 310d573b9e8..8efdfaa44a5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1392,6 +1392,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); + unsigned timeout; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1421,10 +1422,13 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, GFP_KERNEL); } + timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : + ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_admin_cmd(dev, &c, &cmd.result); + status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result, + timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); -- cgit v1.2.3
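To round out the series, a hedged userspace sketch of what the timeout change enables, assuming the uapi of this era (struct nvme_admin_cmd with a timeout_ms field and the NVME_IOCTL_ADMIN_CMD ioctl; opcode 0x06 is Identify). A zero timeout_ms keeps the driver's default ADMIN_TIMEOUT.

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/nvme.h>

	/* Issue Identify Controller with a caller-chosen 2 second timeout. */
	static int identify_ctrl_timed(int fd, void *buf)
	{
		struct nvme_admin_cmd cmd;

		memset(&cmd, 0, sizeof(cmd));
		cmd.opcode = 0x06;			/* Identify */
		cmd.addr = (uint64_t)(uintptr_t)buf;	/* 4096-byte buffer */
		cmd.data_len = 4096;
		cmd.cdw10 = 1;				/* CNS=1: controller */
		cmd.timeout_ms = 2000;			/* honoured after this patch */

		return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	}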