// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017, Microsoft Corporation.
 * Copyright (c) 2025, Stefan Metzmacher
 */

#include "internal.h"

/*
 * Allocate MRs used for RDMA read/write.
 * The number of MRs in flight at any time will not exceed the hardware
 * capability negotiated in responder_resources.
 * All MRs are kept in mr_list. An MR is recovered in
 * smbdirect_connection_deregister_mr_io() after it has been used.
 * The content of a list entry changes as MRs are used and recovered
 * for I/O, but the list links will not change.
 */
int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc)
{
	const struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct smbdirect_mr_io *mr;
	int ret;
	u32 i;

	if (sp->responder_resources == 0) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "responder_resources negotiated as 0\n");
		return -EINVAL;
	}

	/* Allocate more MRs (2x) than hardware responder_resources */
	for (i = 0; i < sp->responder_resources * 2; i++) {
		mr = kzalloc_obj(*mr);
		if (!mr) {
			ret = -ENOMEM;
			goto kzalloc_mr_failed;
		}

		kref_init(&mr->kref);
		mutex_init(&mr->mutex);

		mr->mr = ib_alloc_mr(sc->ib.pd,
				     sc->mr_io.type,
				     sp->max_frmr_depth);
		if (IS_ERR(mr->mr)) {
			ret = PTR_ERR(mr->mr);
			smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
					      "ib_alloc_mr failed ret=%d (%1pe) type=0x%x max_frmr_depth=%u\n",
					      ret, SMBDIRECT_DEBUG_ERR_PTR(ret),
					      sc->mr_io.type, sp->max_frmr_depth);
			goto ib_alloc_mr_failed;
		}

		mr->sgt.sgl = kzalloc_objs(struct scatterlist, sp->max_frmr_depth);
		if (!mr->sgt.sgl) {
			ret = -ENOMEM;
			smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
					      "failed to allocate sgl, max_frmr_depth=%u\n",
					      sp->max_frmr_depth);
			goto kcalloc_sgl_failed;
		}

		mr->state = SMBDIRECT_MR_READY;
		mr->socket = sc;

		list_add_tail(&mr->list, &sc->mr_io.all.list);
		atomic_inc(&sc->mr_io.ready.count);
	}

	return 0;

kcalloc_sgl_failed:
	ib_dereg_mr(mr->mr);
ib_alloc_mr_failed:
	mutex_destroy(&mr->mutex);
	kfree(mr);
kzalloc_mr_failed:
	smbdirect_connection_destroy_mr_list(sc);
	return ret;
}

static void smbdirect_mr_io_disable_locked(struct smbdirect_mr_io *mr)
{
	struct smbdirect_socket *sc = mr->socket;

	lockdep_assert_held(&mr->mutex);

	if (mr->state == SMBDIRECT_MR_DISABLED)
		return;

	if (mr->mr)
		ib_dereg_mr(mr->mr);
	if (mr->sgt.nents)
		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
	kfree(mr->sgt.sgl);

	mr->mr = NULL;
	mr->sgt.sgl = NULL;
	mr->sgt.nents = 0;

	mr->state = SMBDIRECT_MR_DISABLED;
}

static void smbdirect_mr_io_free_locked(struct kref *kref)
{
	struct smbdirect_mr_io *mr =
		container_of(kref, struct smbdirect_mr_io, kref);

	lockdep_assert_held(&mr->mutex);

	/*
	 * smbdirect_mr_io_disable_locked() should already have been called!
	 */
	if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
		smbdirect_mr_io_disable_locked(mr);

	mutex_unlock(&mr->mutex);
	mutex_destroy(&mr->mutex);
	kfree(mr);
}

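/*
 * Illustrative sketch (a reading aid, not part of the compiled flow):
 * the reference drop pattern used at all three kref_put() call sites
 * in this file. smbdirect_mr_io_free_locked() unlocks mr->mutex itself
 * before freeing, so the caller only unlocks when a reference remains:
 *
 *	mutex_lock(&mr->mutex);
 *	...
 *	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
 *		mutex_unlock(&mr->mutex);
 */
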
void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc)
{
	struct smbdirect_mr_io *mr, *tmp;
	LIST_HEAD(all_list);
	unsigned long flags;

	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
	list_splice_tail_init(&sc->mr_io.all.list, &all_list);
	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);

	list_for_each_entry_safe(mr, tmp, &all_list, list) {
		mutex_lock(&mr->mutex);

		smbdirect_mr_io_disable_locked(mr);
		list_del(&mr->list);
		mr->socket = NULL;

		/*
		 * No kref_put_mutex() as it's already locked.
		 *
		 * If kref_put() returned 1,
		 * smbdirect_mr_io_free_locked() ran,
		 * unlocked the mutex and freed the mr.
		 *
		 * If kref_put() returned 0 we know that
		 * smbdirect_mr_io_free_locked() didn't
		 * run, not by us nor by anyone else, as we
		 * still hold the mutex, so we need to unlock.
		 *
		 * If the mr is still registered it will
		 * be dangling (detached from the connection),
		 * waiting for smbdirect_connection_deregister_mr_io()
		 * to be called in order to free the memory.
		 */
		if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
			mutex_unlock(&mr->mutex);
	}
}

/*
 * Get an MR from mr_list. This function waits until there is at least one MR
 * available in the list. Several CPUs may be issuing I/O and trying to get an
 * MR at the same time; sc->mr_io.all.lock protects against that.
 */
static struct smbdirect_mr_io *
smbdirect_connection_get_mr_io(struct smbdirect_socket *sc)
{
	struct smbdirect_mr_io *mr;
	unsigned long flags;
	int ret;

again:
	ret = wait_event_interruptible(sc->mr_io.ready.wait_queue,
				       atomic_read(&sc->mr_io.ready.count) ||
				       sc->status != SMBDIRECT_SOCKET_CONNECTED);
	if (ret) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "wait_event_interruptible ret=%d (%1pe)\n",
				      ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
		return NULL;
	}

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "sc->status=%s sc->first_error=%1pe\n",
				      smbdirect_socket_status_string(sc->status),
				      SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
		return NULL;
	}

	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
	list_for_each_entry(mr, &sc->mr_io.all.list, list) {
		if (mr->state == SMBDIRECT_MR_READY) {
			mr->state = SMBDIRECT_MR_REGISTERED;
			kref_get(&mr->kref);
			spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
			atomic_dec(&sc->mr_io.ready.count);
			atomic_inc(&sc->mr_io.used.count);
			return mr;
		}
	}
	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);

	/*
	 * We can fail to get an MR here because other processes may
	 * be trying to acquire one at the same time. If that happens,
	 * retry.
	 */
	goto again;
}

static void smbdirect_connection_mr_io_register_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_mr_io *mr =
		container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
	struct smbdirect_socket *sc = mr->socket;

	if (wc->status != IB_WC_SUCCESS) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "wc->status=%s opcode=%d\n",
				      ib_wc_status_msg(wc->status), wc->opcode);
		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
	}
}

static void smbdirect_connection_mr_io_local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_mr_io *mr =
		container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
	struct smbdirect_socket *sc = mr->socket;

	mr->state = SMBDIRECT_MR_INVALIDATED;

	if (wc->status != IB_WC_SUCCESS) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "invalidate failed status=%s\n",
				      ib_wc_status_msg(wc->status));
		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
	}
	complete(&mr->invalidate_done);
}

/*
 * Transcribe the pages from an iterator into an MR scatterlist.
 */
static int smbdirect_iter_to_sgt(struct iov_iter *iter, struct sg_table *sgt,
				 unsigned int max_sg)
{
	int ret;

	memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));

	ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
	WARN_ON(ret < 0);
	if (sgt->nents > 0)
		sg_mark_end(&sgt->sgl[sgt->nents - 1]);
	return ret;
}

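/*
 * Summary of the MR state transitions implemented in this file (a
 * reading aid derived from the call sites above and below, not
 * normative):
 *
 *	SMBDIRECT_MR_READY       -> SMBDIRECT_MR_REGISTERED
 *					(smbdirect_connection_get_mr_io)
 *	SMBDIRECT_MR_REGISTERED  -> SMBDIRECT_MR_INVALIDATED
 *					(local invalidate completion, or
 *					 remote invalidation by the peer)
 *	SMBDIRECT_MR_INVALIDATED -> SMBDIRECT_MR_READY
 *					(smbdirect_connection_deregister_mr_io)
 *	registration failure     -> SMBDIRECT_MR_ERROR
 *	connection teardown      -> SMBDIRECT_MR_DISABLED
 */
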
/*
 * Register memory for RDMA read/write.
 * iter: the buffer to register memory with
 * writing: true if this is an RDMA write (SMB read), false for RDMA read
 * need_invalidate: true if this MR needs to be locally invalidated after I/O
 * return value: the MR registered, NULL if failed.
 */
struct smbdirect_mr_io *
smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
				    struct iov_iter *iter,
				    bool writing,
				    bool need_invalidate)
{
	const struct smbdirect_socket_parameters *sp = &sc->parameters;
	struct smbdirect_mr_io *mr;
	int ret, num_pages;
	struct ib_reg_wr *reg_wr;

	num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
	if (num_pages > sp->max_frmr_depth) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "num_pages=%d max_frmr_depth=%u\n",
				      num_pages, sp->max_frmr_depth);
		WARN_ON_ONCE(1);
		return NULL;
	}

	mr = smbdirect_connection_get_mr_io(sc);
	if (!mr) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "smbdirect_connection_get_mr_io returning NULL\n");
		return NULL;
	}

	mutex_lock(&mr->mutex);

	mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	mr->need_invalidate = need_invalidate;
	mr->sgt.nents = 0;
	mr->sgt.orig_nents = 0;

	smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
			      "num_pages=%u count=%zu depth=%u\n",
			      num_pages, iov_iter_count(iter),
			      sp->max_frmr_depth);
	smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth);

	ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
	if (!ret) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n",
				      num_pages, mr->dir,
				      ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
		goto dma_map_error;
	}

	ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
	if (ret != mr->sgt.nents) {
		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
				      "ib_map_mr_sg failed ret=%d nents=%u\n",
				      ret, mr->sgt.nents);
		goto map_mr_error;
	}

	ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
	reg_wr = &mr->wr;
	reg_wr->wr.opcode = IB_WR_REG_MR;
	mr->cqe.done = smbdirect_connection_mr_io_register_done;
	reg_wr->wr.wr_cqe = &mr->cqe;
	reg_wr->wr.num_sge = 0;
	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
	reg_wr->mr = mr->mr;
	reg_wr->key = mr->mr->rkey;
	reg_wr->access = writing ?
		IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
		IB_ACCESS_REMOTE_READ;

	/*
	 * There is no need to wait for completion of ib_post_send
	 * on IB_WR_REG_MR. The hardware enforces a barrier and order of
	 * execution on the next ib_post_send when we actually send I/O to
	 * the remote peer.
	 */
	ret = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
	if (!ret) {
		/*
		 * smbdirect_connection_get_mr_io() gave us a reference
		 * via kref_get(&mr->kref); we keep that and let
		 * the caller use smbdirect_connection_deregister_mr_io()
		 * to remove it again.
		 */
		mutex_unlock(&mr->mutex);
		return mr;
	}

	smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
			      "ib_post_send failed ret=%d (%1pe) reg_wr->key=0x%x\n",
			      ret, SMBDIRECT_DEBUG_ERR_PTR(ret),
			      reg_wr->key);

map_mr_error:
	ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
dma_map_error:
	mr->sgt.nents = 0;
	mr->state = SMBDIRECT_MR_ERROR;
	atomic_dec(&sc->mr_io.used.count);
	smbdirect_socket_schedule_cleanup(sc, ret);

	/*
	 * smbdirect_connection_get_mr_io() gave us a reference
	 * via kref_get(&mr->kref); we need to remove it again
	 * on error.
	 *
	 * No kref_put_mutex() as it's already locked.
	 *
	 * If kref_put() returned 1,
	 * smbdirect_mr_io_free_locked() ran,
	 * unlocked the mutex and freed the mr.
	 *
	 * If kref_put() returned 0 we know that
	 * smbdirect_mr_io_free_locked() didn't
	 * run, not by us nor by anyone else, as we
	 * still hold the mutex, so we need to unlock.
	 */
	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
		mutex_unlock(&mr->mutex);
	return NULL;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io);

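/*
 * Illustrative caller flow (a sketch; the request structure and its
 * "descriptor" field are hypothetical, only the functions are from
 * this file):
 *
 *	mr = smbdirect_connection_register_mr_io(sc, &iter, writing,
 *						 need_invalidate);
 *	if (!mr)
 *		return -EAGAIN;
 *	smbdirect_mr_io_fill_buffer_descriptor(mr, &req->descriptor);
 *	... send the request carrying the descriptor to the peer ...
 *	smbdirect_connection_deregister_mr_io(mr);
 */
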
void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
					    struct smbdirect_buffer_descriptor_v1 *v1)
{
	mutex_lock(&mr->mutex);
	if (mr->state == SMBDIRECT_MR_REGISTERED) {
		v1->offset = cpu_to_le64(mr->mr->iova);
		v1->token = cpu_to_le32(mr->mr->rkey);
		v1->length = cpu_to_le32(mr->mr->length);
	} else {
		v1->offset = cpu_to_le64(U64_MAX);
		v1->token = cpu_to_le32(U32_MAX);
		v1->length = cpu_to_le32(U32_MAX);
	}
	mutex_unlock(&mr->mutex);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor);

/*
 * Deregister an MR after I/O is done.
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent the remote peer
 * from modifying the data after the upper layer has consumed it.
 */
void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr)
{
	struct smbdirect_socket *sc = mr->socket;
	int ret = 0;

lock_again:
	mutex_lock(&mr->mutex);

	if (mr->state == SMBDIRECT_MR_DISABLED)
		goto put_kref;

	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
		smbdirect_mr_io_disable_locked(mr);
		goto put_kref;
	}

	if (mr->need_invalidate) {
		struct ib_send_wr *wr = &mr->inv_wr;

		/* Need to finish local invalidation before returning */
		wr->opcode = IB_WR_LOCAL_INV;
		mr->cqe.done = smbdirect_connection_mr_io_local_inv_done;
		wr->wr_cqe = &mr->cqe;
		wr->num_sge = 0;
		wr->ex.invalidate_rkey = mr->mr->rkey;
		wr->send_flags = IB_SEND_SIGNALED;

		init_completion(&mr->invalidate_done);
		ret = ib_post_send(sc->ib.qp, wr, NULL);
		if (ret) {
			smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
					      "ib_post_send failed ret=%d (%1pe)\n",
					      ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
			smbdirect_mr_io_disable_locked(mr);
			smbdirect_socket_schedule_cleanup(sc, ret);
			goto done;
		}

		/*
		 * We still hold the reference to mr,
		 * so we can unlock while waiting.
		 */
		mutex_unlock(&mr->mutex);
		wait_for_completion(&mr->invalidate_done);
		mr->need_invalidate = false;
		goto lock_again;
	} else {
		/*
		 * The remote peer already invalidated the MR
		 * (remote invalidation); just mark it
		 * SMBDIRECT_MR_INVALIDATED and recover it to
		 * SMBDIRECT_MR_READY below for the next use.
		 */
		mr->state = SMBDIRECT_MR_INVALIDATED;
	}

	if (mr->sgt.nents) {
		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
		mr->sgt.nents = 0;
	}

	WARN_ONCE(mr->state != SMBDIRECT_MR_INVALIDATED,
		  "mr->state[%u] != SMBDIRECT_MR_INVALIDATED[%u]\n",
		  mr->state, SMBDIRECT_MR_INVALIDATED);
	mr->state = SMBDIRECT_MR_READY;
	if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
		wake_up(&sc->mr_io.ready.wait_queue);

done:
	atomic_dec(&sc->mr_io.used.count);

put_kref:
	/*
	 * No kref_put_mutex() as it's already locked.
	 *
	 * If kref_put() returned 1,
	 * smbdirect_mr_io_free_locked() ran,
	 * unlocked the mutex and freed the mr.
	 *
	 * If kref_put() returned 0 we know that
	 * smbdirect_mr_io_free_locked() didn't
	 * run, not by us nor by anyone else, as we
	 * still hold the mutex, so we need to unlock
	 * and keep the mr in SMBDIRECT_MR_READY or
	 * SMBDIRECT_MR_ERROR state.
	 */
	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
		mutex_unlock(&mr->mutex);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io);
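
/*
 * Note on connection teardown (a summary derived from the code above):
 * smbdirect_connection_destroy_mr_list() disables and detaches an mr
 * that is still held by a caller; the caller's reference keeps the
 * memory alive, and a later smbdirect_connection_deregister_mr_io()
 * takes the SMBDIRECT_MR_DISABLED fast path and simply drops the
 * final reference:
 *
 *	// connection already gone, mr->state == SMBDIRECT_MR_DISABLED
 *	smbdirect_connection_deregister_mr_io(mr);	// frees the mr
 */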