Skip to content

Commit 5f9794d

Browse files
jgunthorpe authored and dledford committed
RDMA/ucontext: Add a core API for mmaping driver IO memory
To support disassociation and PCI hot unplug, we have to track all the VMAs that refer to the device IO memory. When disassociation occurs the VMAs have to be revised to point to the zero page, not the IO memory, to allow the physical HW to be unplugged.

The three drivers supporting this implemented three different versions of this algorithm, all leaving something to be desired. This new common implementation has a few differences from the driver versions:

- Track all VMAs, including splitting/truncating/etc. Tie the lifetime of the private data allocation to the lifetime of the vma. This avoids any tricks with setting vm_ops which Linus didn't like. (see link)
- Support multiple mms, and support properly tracking mmaps triggered by processes other than the one first opening the uverbs fd. This makes fork behavior of disassociation enabled drivers the same as fork support in normal drivers.
- Don't use crazy get_task stuff.
- Simplify the approach to racing between vm_ops close and disassociation, fixing the related bugs most of the driver implementations had.

Since we are in core code the tracking list can be placed in struct ib_uverbs_ufile, which has a lifetime strictly longer than any VMAs created by mmap on the uverbs FD.

Link: https://www.spinics.net/lists/stable/msg248747.html
Link: https://lkml.kernel.org/r/CA+55aFxJTV_g46AQPoPXen-UPiqR1HGMZictt7VpC-SMFbm3Cw@mail.gmail.com
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
1 parent b00a92c commit 5f9794d

File tree

5 files changed

+252
-1
lines changed

5 files changed

+252
-1
lines changed

drivers/infiniband/core/rdma_core.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -842,8 +842,10 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
842842
struct ib_ucontext *ucontext = ufile->ucontext;
843843
int ret;
844844

845-
if (reason == RDMA_REMOVE_DRIVER_REMOVE)
845+
if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
846+
uverbs_user_mmap_disassociate(ufile);
846847
ufile_disassociate_ucontext(ucontext);
848+
}
847849

848850
put_pid(ucontext->tgid);
849851
ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,

drivers/infiniband/core/rdma_core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi);
160160
void uverbs_destroy_api(struct uverbs_api *uapi);
161161
void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
162162
unsigned int num_attrs);
163+
void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
163164

164165
#endif /* RDMA_CORE_H */

drivers/infiniband/core/uverbs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ struct ib_uverbs_file {
158158
spinlock_t uobjects_lock;
159159
struct list_head uobjects;
160160

161+
struct mutex umap_lock;
162+
struct list_head umaps;
163+
161164
u64 uverbs_cmd_mask;
162165
u64 uverbs_ex_cmd_mask;
163166

drivers/infiniband/core/uverbs_main.c

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include <linux/cdev.h>
4646
#include <linux/anon_inodes.h>
4747
#include <linux/slab.h>
48+
#include <linux/sched/mm.h>
4849

4950
#include <linux/uaccess.h>
5051

@@ -811,6 +812,226 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
811812
return ret;
812813
}
813814

815+
/*
816+
* Each time we map IO memory into user space this keeps track of the mapping.
817+
* When the device is hot-unplugged we 'zap' the mmaps in user space to point
818+
* to the zero page and allow the hot unplug to proceed.
819+
*
820+
* This is necessary for cases like PCI physical hot unplug as the actual BAR
821+
* memory may vanish after this and access to it from userspace could MCE.
822+
*
823+
* RDMA drivers supporting disassociation must have their user space designed
824+
* to cope in some way with their IO pages going to the zero page.
825+
*/
826+
struct rdma_umap_priv {
827+
struct vm_area_struct *vma;
828+
struct list_head list;
829+
};
830+
831+
static const struct vm_operations_struct rdma_umap_ops;
832+
833+
static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
834+
struct vm_area_struct *vma)
835+
{
836+
struct ib_uverbs_file *ufile = vma->vm_file->private_data;
837+
838+
priv->vma = vma;
839+
vma->vm_private_data = priv;
840+
vma->vm_ops = &rdma_umap_ops;
841+
842+
mutex_lock(&ufile->umap_lock);
843+
list_add(&priv->list, &ufile->umaps);
844+
mutex_unlock(&ufile->umap_lock);
845+
}
846+
847+
/*
848+
* The VMA has been dup'd, initialize the vm_private_data with a new tracking
849+
* struct
850+
*/
851+
static void rdma_umap_open(struct vm_area_struct *vma)
852+
{
853+
struct ib_uverbs_file *ufile = vma->vm_file->private_data;
854+
struct rdma_umap_priv *opriv = vma->vm_private_data;
855+
struct rdma_umap_priv *priv;
856+
857+
if (!opriv)
858+
return;
859+
860+
/* We are racing with disassociation */
861+
if (!down_read_trylock(&ufile->hw_destroy_rwsem))
862+
goto out_zap;
863+
/*
864+
* Disassociation already completed, the VMA should already be zapped.
865+
*/
866+
if (!ufile->ucontext)
867+
goto out_unlock;
868+
869+
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
870+
if (!priv)
871+
goto out_unlock;
872+
rdma_umap_priv_init(priv, vma);
873+
874+
up_read(&ufile->hw_destroy_rwsem);
875+
return;
876+
877+
out_unlock:
878+
up_read(&ufile->hw_destroy_rwsem);
879+
out_zap:
880+
/*
881+
* We can't allow the VMA to be created with the actual IO pages, that
882+
* would break our API contract, and it can't be stopped at this
883+
* point, so zap it.
884+
*/
885+
vma->vm_private_data = NULL;
886+
zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
887+
}
888+
889+
static void rdma_umap_close(struct vm_area_struct *vma)
890+
{
891+
struct ib_uverbs_file *ufile = vma->vm_file->private_data;
892+
struct rdma_umap_priv *priv = vma->vm_private_data;
893+
894+
if (!priv)
895+
return;
896+
897+
/*
898+
* The vma holds a reference on the struct file that created it, which
899+
* in turn means that the ib_uverbs_file is guaranteed to exist at
900+
* this point.
901+
*/
902+
mutex_lock(&ufile->umap_lock);
903+
list_del(&priv->list);
904+
mutex_unlock(&ufile->umap_lock);
905+
kfree(priv);
906+
}
907+
908+
static const struct vm_operations_struct rdma_umap_ops = {
909+
.open = rdma_umap_open,
910+
.close = rdma_umap_close,
911+
};
912+
913+
static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
914+
struct vm_area_struct *vma,
915+
unsigned long size)
916+
{
917+
struct ib_uverbs_file *ufile = ucontext->ufile;
918+
struct rdma_umap_priv *priv;
919+
920+
if (vma->vm_end - vma->vm_start != size)
921+
return ERR_PTR(-EINVAL);
922+
923+
/* Driver is using this wrong, must be called by ib_uverbs_mmap */
924+
if (WARN_ON(!vma->vm_file ||
925+
vma->vm_file->private_data != ufile))
926+
return ERR_PTR(-EINVAL);
927+
lockdep_assert_held(&ufile->device->disassociate_srcu);
928+
929+
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
930+
if (!priv)
931+
return ERR_PTR(-ENOMEM);
932+
return priv;
933+
}
934+
935+
/*
936+
* Map IO memory into a process. This is to be called by drivers as part of
937+
* their mmap() functions if they wish to send something like PCI-E BAR memory
938+
* to userspace.
939+
*/
940+
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
941+
unsigned long pfn, unsigned long size, pgprot_t prot)
942+
{
943+
struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
944+
945+
if (IS_ERR(priv))
946+
return PTR_ERR(priv);
947+
948+
vma->vm_page_prot = prot;
949+
if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
950+
kfree(priv);
951+
return -EAGAIN;
952+
}
953+
954+
rdma_umap_priv_init(priv, vma);
955+
return 0;
956+
}
957+
EXPORT_SYMBOL(rdma_user_mmap_io);
958+
959+
/*
960+
* The page case is here for a slightly different reason, the driver expects
961+
* to be able to free the page it is sharing to user space when it destroys
962+
* its ucontext, which means we need to zap the user space references.
963+
*
964+
* We could handle this differently by providing an API to allocate a shared
965+
* page and then only freeing the shared page when the last ufile is
966+
* destroyed.
967+
*/
968+
int rdma_user_mmap_page(struct ib_ucontext *ucontext,
969+
struct vm_area_struct *vma, struct page *page,
970+
unsigned long size)
971+
{
972+
struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
973+
974+
if (IS_ERR(priv))
975+
return PTR_ERR(priv);
976+
977+
if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
978+
vma->vm_page_prot)) {
979+
kfree(priv);
980+
return -EAGAIN;
981+
}
982+
983+
rdma_umap_priv_init(priv, vma);
984+
return 0;
985+
}
986+
EXPORT_SYMBOL(rdma_user_mmap_page);
987+
988+
void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
989+
{
990+
struct rdma_umap_priv *priv, *next_priv;
991+
992+
lockdep_assert_held(&ufile->hw_destroy_rwsem);
993+
994+
while (1) {
995+
struct mm_struct *mm = NULL;
996+
997+
/* Get an arbitrary mm pointer that hasn't been cleaned yet */
998+
mutex_lock(&ufile->umap_lock);
999+
if (!list_empty(&ufile->umaps)) {
1000+
mm = list_first_entry(&ufile->umaps,
1001+
struct rdma_umap_priv, list)
1002+
->vma->vm_mm;
1003+
mmget(mm);
1004+
}
1005+
mutex_unlock(&ufile->umap_lock);
1006+
if (!mm)
1007+
return;
1008+
1009+
/*
1010+
* The umap_lock is nested under mmap_sem since it used within
1011+
* the vma_ops callbacks, so we have to clean the list one mm
1012+
* at a time to get the lock ordering right. Typically there
1013+
* will only be one mm, so no big deal.
1014+
*/
1015+
down_write(&mm->mmap_sem);
1016+
mutex_lock(&ufile->umap_lock);
1017+
list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
1018+
list) {
1019+
struct vm_area_struct *vma = priv->vma;
1020+
1021+
if (vma->vm_mm != mm)
1022+
continue;
1023+
list_del_init(&priv->list);
1024+
1025+
zap_vma_ptes(vma, vma->vm_start,
1026+
vma->vm_end - vma->vm_start);
1027+
vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
1028+
}
1029+
mutex_unlock(&ufile->umap_lock);
1030+
up_write(&mm->mmap_sem);
1031+
mmput(mm);
1032+
}
1033+
}
1034+
8141035
/*
8151036
* ib_uverbs_open() does not need the BKL:
8161037
*
@@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
8721093
spin_lock_init(&file->uobjects_lock);
8731094
INIT_LIST_HEAD(&file->uobjects);
8741095
init_rwsem(&file->hw_destroy_rwsem);
1096+
mutex_init(&file->umap_lock);
1097+
INIT_LIST_HEAD(&file->umaps);
8751098

8761099
filp->private_data = file;
8771100
list_add_tail(&file->list, &dev->uverbs_file_list);

include/rdma/ib_verbs.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2646,6 +2646,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
26462646
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
26472647
void *data);
26482648

2649+
#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
2650+
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
2651+
unsigned long pfn, unsigned long size, pgprot_t prot);
2652+
int rdma_user_mmap_page(struct ib_ucontext *ucontext,
2653+
struct vm_area_struct *vma, struct page *page,
2654+
unsigned long size);
2655+
#else
2656+
static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
2657+
struct vm_area_struct *vma,
2658+
unsigned long pfn, unsigned long size,
2659+
pgprot_t prot)
2660+
{
2661+
return -EINVAL;
2662+
}
2663+
static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext,
2664+
struct vm_area_struct *vma, struct page *page,
2665+
unsigned long size)
2666+
{
2667+
return -EINVAL;
2668+
}
2669+
#endif
2670+
26492671
static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
26502672
{
26512673
return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;

0 commit comments

Comments
 (0)