@@ -45,6 +45,7 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
 
@@ -811,6 +812,226 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 	return ret;
 }
 
+/*
+ * Each time we map IO memory into user space this keeps track of the mapping.
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point
+ * to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ */
+struct rdma_umap_priv {
+	struct vm_area_struct *vma;
+	struct list_head list;
+};
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+				struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+	priv->vma = vma;
+	vma->vm_private_data = priv;
+	vma->vm_ops = &rdma_umap_ops;
+
+	mutex_lock(&ufile->umap_lock);
+	list_add(&priv->list, &ufile->umaps);
+	mutex_unlock(&ufile->umap_lock);
+}
+
+/*
+ * The VMA has been dup'd, initialize the vm_private_data with a new tracking
+ * struct
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *opriv = vma->vm_private_data;
+	struct rdma_umap_priv *priv;
+
+	if (!opriv)
+		return;
+
+	/* We are racing with disassociation */
+	if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+		goto out_zap;
+	/*
+	 * Disassociation already completed, the VMA should already be zapped.
+	 */
+	if (!ufile->ucontext)
+		goto out_unlock;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto out_unlock;
+	rdma_umap_priv_init(priv, vma);
+
+	up_read(&ufile->hw_destroy_rwsem);
+	return;
+
+out_unlock:
+	up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+	/*
+	 * We can't allow the VMA to be created with the actual IO pages, that
+	 * would break our API contract, and it can't be stopped at this
+	 * point, so zap it.
+	 */
+	vma->vm_private_data = NULL;
+	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *priv = vma->vm_private_data;
+
+	if (!priv)
+		return;
+
+	/*
+	 * The vma holds a reference on the struct file that created it, which
+	 * in turn means that the ib_uverbs_file is guaranteed to exist at
+	 * this point.
+	 */
+	mutex_lock(&ufile->umap_lock);
+	list_del(&priv->list);
+	mutex_unlock(&ufile->umap_lock);
+	kfree(priv);
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+	.open = rdma_umap_open,
+	.close = rdma_umap_close,
+};
+
+static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
+						 struct vm_area_struct *vma,
+						 unsigned long size)
+{
+	struct ib_uverbs_file *ufile = ucontext->ufile;
+	struct rdma_umap_priv *priv;
+
+	if (vma->vm_end - vma->vm_start != size)
+		return ERR_PTR(-EINVAL);
+
+	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
+	if (WARN_ON(!vma->vm_file ||
+		    vma->vm_file->private_data != ufile))
+		return ERR_PTR(-EINVAL);
+	lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return ERR_PTR(-ENOMEM);
+	return priv;
+}
+
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+		      unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	vma->vm_page_prot = prot;
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/*
+ * The page case is here for a slightly different reason, the driver expects
+ * to be able to free the page it is sharing to user space when it destroys
+ * its ucontext, which means we need to zap the user space references.
+ *
+ * We could handle this differently by providing an API to allocate a shared
+ * page and then only freeing the shared page when the last ufile is
+ * destroyed.
+ */
+int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+			struct vm_area_struct *vma, struct page *page,
+			unsigned long size)
+{
+	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
+			    vma->vm_page_prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_page);
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+	struct rdma_umap_priv *priv, *next_priv;
+
+	lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+	while (1) {
+		struct mm_struct *mm = NULL;
+
+		/* Get an arbitrary mm pointer that hasn't been cleaned yet */
+		mutex_lock(&ufile->umap_lock);
+		if (!list_empty(&ufile->umaps)) {
+			mm = list_first_entry(&ufile->umaps,
+					      struct rdma_umap_priv, list)
+				     ->vma->vm_mm;
+			mmget(mm);
+		}
+		mutex_unlock(&ufile->umap_lock);
+		if (!mm)
+			return;
+
+		/*
+		 * The umap_lock is nested under mmap_sem since it is used
+		 * within the vma_ops callbacks, so we have to clean the list
+		 * one mm at a time to get the lock ordering right. Typically
+		 * there will only be one mm, so no big deal.
+		 */
+		down_write(&mm->mmap_sem);
+		mutex_lock(&ufile->umap_lock);
+		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+					  list) {
+			struct vm_area_struct *vma = priv->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+			list_del_init(&priv->list);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+			vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+		}
+		mutex_unlock(&ufile->umap_lock);
+		up_write(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
 /*
  * ib_uverbs_open() does not need the BKL:
  *
@@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	spin_lock_init(&file->uobjects_lock);
 	INIT_LIST_HEAD(&file->uobjects);
 	init_rwsem(&file->hw_destroy_rwsem);
+	mutex_init(&file->umap_lock);
+	INIT_LIST_HEAD(&file->umaps);
 
 	filp->private_data = file;
 	list_add_tail(&file->list, &dev->uverbs_file_list);
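
For the shared-page case, the comment above rdma_user_mmap_page() describes a driver that frees the page when its ucontext is destroyed, relying on the core to zap any user-space references first. A sketch under the same invented my_ucontext assumption, where shared_page would have been allocated with alloc_page() at ucontext creation and freed in the driver's dealloc_ucontext:

/* Hypothetical caller, not part of this patch: exporting a driver-owned page. */
static int my_driver_mmap_shared_page(struct ib_ucontext *ibucontext,
				      struct vm_area_struct *vma)
{
	struct my_ucontext *uctx =
		container_of(ibucontext, struct my_ucontext, ibucontext);

	/*
	 * The core remaps the page into the VMA and tracks the mapping;
	 * uverbs_user_mmap_disassociate() zaps the user references before
	 * the driver frees the page in dealloc_ucontext.
	 */
	return rdma_user_mmap_page(ibucontext, vma, uctx->shared_page,
				   PAGE_SIZE);
}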