1
1
// SPDX-License-Identifier: GPL-2.0
2
2
#include <linux/ceph/ceph_debug.h>
3
+ #include <linux/ceph/striper.h>
3
4
4
5
#include <linux/module.h>
5
6
#include <linux/sched.h>
@@ -1795,6 +1796,297 @@ static long ceph_fallocate(struct file *file, int mode,
1795
1796
return ret ;
1796
1797
}
1797
1798
1799
+ /*
1800
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
1801
+ * src_ci. Two attempts are made to obtain both caps, and an error is return if
1802
+ * this fails; zero is returned on success.
1803
+ */
1804
+ static int get_rd_wr_caps (struct ceph_inode_info * src_ci ,
1805
+ loff_t src_endoff , int * src_got ,
1806
+ struct ceph_inode_info * dst_ci ,
1807
+ loff_t dst_endoff , int * dst_got )
1808
+ {
1809
+ int ret = 0 ;
1810
+ bool retrying = false;
1811
+
1812
+ retry_caps :
1813
+ ret = ceph_get_caps (dst_ci , CEPH_CAP_FILE_WR , CEPH_CAP_FILE_BUFFER ,
1814
+ dst_endoff , dst_got , NULL );
1815
+ if (ret < 0 )
1816
+ return ret ;
1817
+
1818
+ /*
1819
+ * Since we're already holding the FILE_WR capability for the dst file,
1820
+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
1821
+ * retry dance instead to try to get both capabilities.
1822
+ */
1823
+ ret = ceph_try_get_caps (src_ci , CEPH_CAP_FILE_RD , CEPH_CAP_FILE_SHARED ,
1824
+ false, src_got );
1825
+ if (ret <= 0 ) {
1826
+ /* Start by dropping dst_ci caps and getting src_ci caps */
1827
+ ceph_put_cap_refs (dst_ci , * dst_got );
1828
+ if (retrying ) {
1829
+ if (!ret )
1830
+ /* ceph_try_get_caps masks EAGAIN */
1831
+ ret = - EAGAIN ;
1832
+ return ret ;
1833
+ }
1834
+ ret = ceph_get_caps (src_ci , CEPH_CAP_FILE_RD ,
1835
+ CEPH_CAP_FILE_SHARED , src_endoff ,
1836
+ src_got , NULL );
1837
+ if (ret < 0 )
1838
+ return ret ;
1839
+ /*... drop src_ci caps too, and retry */
1840
+ ceph_put_cap_refs (src_ci , * src_got );
1841
+ retrying = true;
1842
+ goto retry_caps ;
1843
+ }
1844
+ return ret ;
1845
+ }
1846
+
1847
/*
 * Counterpart of get_rd_wr_caps(): drop the cap references described by
 * src_got on src_ci and by dst_got on dst_ci, source first.
 */
static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
			   struct ceph_inode_info *dst_ci, int dst_got)
{
	/* release the source caps */
	ceph_put_cap_refs(src_ci, src_got);

	/* release the destination caps */
	ceph_put_cap_refs(dst_ci, dst_got);
}
1853
+
1854
+ /*
1855
+ * This function does several size-related checks, returning an error if:
1856
+ * - source file is smaller than off+len
1857
+ * - destination file size is not OK (inode_newsize_ok())
1858
+ * - max bytes quotas is exceeded
1859
+ */
1860
+ static int is_file_size_ok (struct inode * src_inode , struct inode * dst_inode ,
1861
+ loff_t src_off , loff_t dst_off , size_t len )
1862
+ {
1863
+ loff_t size , endoff ;
1864
+
1865
+ size = i_size_read (src_inode );
1866
+ /*
1867
+ * Don't copy beyond source file EOF. Instead of simply setting length
1868
+ * to (size - src_off), just drop to VFS default implementation, as the
1869
+ * local i_size may be stale due to other clients writing to the source
1870
+ * inode.
1871
+ */
1872
+ if (src_off + len > size ) {
1873
+ dout ("Copy beyond EOF (%llu + %zu > %llu)\n" ,
1874
+ src_off , len , size );
1875
+ return - EOPNOTSUPP ;
1876
+ }
1877
+ size = i_size_read (dst_inode );
1878
+
1879
+ endoff = dst_off + len ;
1880
+ if (inode_newsize_ok (dst_inode , endoff ))
1881
+ return - EOPNOTSUPP ;
1882
+
1883
+ if (ceph_quota_is_max_bytes_exceeded (dst_inode , endoff ))
1884
+ return - EDQUOT ;
1885
+
1886
+ return 0 ;
1887
+ }
1888
+
1889
+ static ssize_t ceph_copy_file_range (struct file * src_file , loff_t src_off ,
1890
+ struct file * dst_file , loff_t dst_off ,
1891
+ size_t len , unsigned int flags )
1892
+ {
1893
+ struct inode * src_inode = file_inode (src_file );
1894
+ struct inode * dst_inode = file_inode (dst_file );
1895
+ struct ceph_inode_info * src_ci = ceph_inode (src_inode );
1896
+ struct ceph_inode_info * dst_ci = ceph_inode (dst_inode );
1897
+ struct ceph_cap_flush * prealloc_cf ;
1898
+ struct ceph_object_locator src_oloc , dst_oloc ;
1899
+ struct ceph_object_id src_oid , dst_oid ;
1900
+ loff_t endoff = 0 , size ;
1901
+ ssize_t ret = - EIO ;
1902
+ u64 src_objnum , dst_objnum , src_objoff , dst_objoff ;
1903
+ u32 src_objlen , dst_objlen , object_size ;
1904
+ int src_got = 0 , dst_got = 0 , err , dirty ;
1905
+ bool do_final_copy = false;
1906
+
1907
+ if (src_inode == dst_inode )
1908
+ return - EINVAL ;
1909
+ if (ceph_snap (dst_inode ) != CEPH_NOSNAP )
1910
+ return - EROFS ;
1911
+
1912
+ /*
1913
+ * Some of the checks below will return -EOPNOTSUPP, which will force a
1914
+ * fallback to the default VFS copy_file_range implementation. This is
1915
+ * desirable in several cases (for ex, the 'len' is smaller than the
1916
+ * size of the objects, or in cases where that would be more
1917
+ * efficient).
1918
+ */
1919
+
1920
+ if ((src_ci -> i_layout .stripe_unit != dst_ci -> i_layout .stripe_unit ) ||
1921
+ (src_ci -> i_layout .stripe_count != dst_ci -> i_layout .stripe_count ) ||
1922
+ (src_ci -> i_layout .object_size != dst_ci -> i_layout .object_size ))
1923
+ return - EOPNOTSUPP ;
1924
+
1925
+ if (len < src_ci -> i_layout .object_size )
1926
+ return - EOPNOTSUPP ; /* no remote copy will be done */
1927
+
1928
+ prealloc_cf = ceph_alloc_cap_flush ();
1929
+ if (!prealloc_cf )
1930
+ return - ENOMEM ;
1931
+
1932
+ /* Start by sync'ing the source file */
1933
+ ret = file_write_and_wait_range (src_file , src_off , (src_off + len ));
1934
+ if (ret < 0 )
1935
+ goto out ;
1936
+
1937
+ /*
1938
+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
1939
+ * clients may have dirty data in their caches. And OSDs know nothing
1940
+ * about caps, so they can't safely do the remote object copies.
1941
+ */
1942
+ err = get_rd_wr_caps (src_ci , (src_off + len ), & src_got ,
1943
+ dst_ci , (dst_off + len ), & dst_got );
1944
+ if (err < 0 ) {
1945
+ dout ("get_rd_wr_caps returned %d\n" , err );
1946
+ ret = - EOPNOTSUPP ;
1947
+ goto out ;
1948
+ }
1949
+
1950
+ ret = is_file_size_ok (src_inode , dst_inode , src_off , dst_off , len );
1951
+ if (ret < 0 )
1952
+ goto out_caps ;
1953
+
1954
+ size = i_size_read (dst_inode );
1955
+ endoff = dst_off + len ;
1956
+
1957
+ /* Drop dst file cached pages */
1958
+ ret = invalidate_inode_pages2_range (dst_inode -> i_mapping ,
1959
+ dst_off >> PAGE_SHIFT ,
1960
+ endoff >> PAGE_SHIFT );
1961
+ if (ret < 0 ) {
1962
+ dout ("Failed to invalidate inode pages (%zd)\n" , ret );
1963
+ ret = 0 ; /* XXX */
1964
+ }
1965
+ src_oloc .pool = src_ci -> i_layout .pool_id ;
1966
+ src_oloc .pool_ns = ceph_try_get_string (src_ci -> i_layout .pool_ns );
1967
+ dst_oloc .pool = dst_ci -> i_layout .pool_id ;
1968
+ dst_oloc .pool_ns = ceph_try_get_string (dst_ci -> i_layout .pool_ns );
1969
+
1970
+ ceph_calc_file_object_mapping (& src_ci -> i_layout , src_off ,
1971
+ src_ci -> i_layout .object_size ,
1972
+ & src_objnum , & src_objoff , & src_objlen );
1973
+ ceph_calc_file_object_mapping (& dst_ci -> i_layout , dst_off ,
1974
+ dst_ci -> i_layout .object_size ,
1975
+ & dst_objnum , & dst_objoff , & dst_objlen );
1976
+ /* object-level offsets need to the same */
1977
+ if (src_objoff != dst_objoff ) {
1978
+ ret = - EOPNOTSUPP ;
1979
+ goto out_caps ;
1980
+ }
1981
+
1982
+ /*
1983
+ * Do a manual copy if the object offset isn't object aligned.
1984
+ * 'src_objlen' contains the bytes left until the end of the object,
1985
+ * starting at the src_off
1986
+ */
1987
+ if (src_objoff ) {
1988
+ /*
1989
+ * we need to temporarily drop all caps as we'll be calling
1990
+ * {read,write}_iter, which will get caps again.
1991
+ */
1992
+ put_rd_wr_caps (src_ci , src_got , dst_ci , dst_got );
1993
+ ret = do_splice_direct (src_file , & src_off , dst_file ,
1994
+ & dst_off , src_objlen , flags );
1995
+ if (ret < 0 ) {
1996
+ dout ("do_splice_direct returned %d\n" , err );
1997
+ goto out ;
1998
+ }
1999
+ len -= ret ;
2000
+ err = get_rd_wr_caps (src_ci , (src_off + len ),
2001
+ & src_got , dst_ci ,
2002
+ (dst_off + len ), & dst_got );
2003
+ if (err < 0 )
2004
+ goto out ;
2005
+ err = is_file_size_ok (src_inode , dst_inode ,
2006
+ src_off , dst_off , len );
2007
+ if (err < 0 )
2008
+ goto out_caps ;
2009
+ }
2010
+ object_size = src_ci -> i_layout .object_size ;
2011
+ while (len >= object_size ) {
2012
+ ceph_calc_file_object_mapping (& src_ci -> i_layout , src_off ,
2013
+ object_size , & src_objnum ,
2014
+ & src_objoff , & src_objlen );
2015
+ ceph_calc_file_object_mapping (& dst_ci -> i_layout , dst_off ,
2016
+ object_size , & dst_objnum ,
2017
+ & dst_objoff , & dst_objlen );
2018
+ ceph_oid_init (& src_oid );
2019
+ ceph_oid_printf (& src_oid , "%llx.%08llx" ,
2020
+ src_ci -> i_vino .ino , src_objnum );
2021
+ ceph_oid_init (& dst_oid );
2022
+ ceph_oid_printf (& dst_oid , "%llx.%08llx" ,
2023
+ dst_ci -> i_vino .ino , dst_objnum );
2024
+ /* Do an object remote copy */
2025
+ err = ceph_osdc_copy_from (
2026
+ & ceph_inode_to_client (src_inode )-> client -> osdc ,
2027
+ src_ci -> i_vino .snap , 0 ,
2028
+ & src_oid , & src_oloc ,
2029
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
2030
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ,
2031
+ & dst_oid , & dst_oloc ,
2032
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
2033
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED , 0 );
2034
+ if (err ) {
2035
+ dout ("ceph_osdc_copy_from returned %d\n" , err );
2036
+ if (!ret )
2037
+ ret = err ;
2038
+ goto out_caps ;
2039
+ }
2040
+ len -= object_size ;
2041
+ src_off += object_size ;
2042
+ dst_off += object_size ;
2043
+ ret += object_size ;
2044
+ }
2045
+
2046
+ if (len )
2047
+ /* We still need one final local copy */
2048
+ do_final_copy = true;
2049
+
2050
+ file_update_time (dst_file );
2051
+ if (endoff > size ) {
2052
+ int caps_flags = 0 ;
2053
+
2054
+ /* Let the MDS know about dst file size change */
2055
+ if (ceph_quota_is_max_bytes_approaching (dst_inode , endoff ))
2056
+ caps_flags |= CHECK_CAPS_NODELAY ;
2057
+ if (ceph_inode_set_size (dst_inode , endoff ))
2058
+ caps_flags |= CHECK_CAPS_AUTHONLY ;
2059
+ if (caps_flags )
2060
+ ceph_check_caps (dst_ci , caps_flags , NULL );
2061
+ }
2062
+ /* Mark Fw dirty */
2063
+ spin_lock (& dst_ci -> i_ceph_lock );
2064
+ dst_ci -> i_inline_version = CEPH_INLINE_NONE ;
2065
+ dirty = __ceph_mark_dirty_caps (dst_ci , CEPH_CAP_FILE_WR , & prealloc_cf );
2066
+ spin_unlock (& dst_ci -> i_ceph_lock );
2067
+ if (dirty )
2068
+ __mark_inode_dirty (dst_inode , dirty );
2069
+
2070
+ out_caps :
2071
+ put_rd_wr_caps (src_ci , src_got , dst_ci , dst_got );
2072
+
2073
+ if (do_final_copy ) {
2074
+ err = do_splice_direct (src_file , & src_off , dst_file ,
2075
+ & dst_off , len , flags );
2076
+ if (err < 0 ) {
2077
+ dout ("do_splice_direct returned %d\n" , err );
2078
+ goto out ;
2079
+ }
2080
+ len -= err ;
2081
+ ret += err ;
2082
+ }
2083
+
2084
+ out :
2085
+ ceph_free_cap_flush (prealloc_cf );
2086
+
2087
+ return ret ;
2088
+ }
2089
+
1798
2090
const struct file_operations ceph_file_fops = {
1799
2091
.open = ceph_open ,
1800
2092
.release = ceph_release ,
@@ -1810,5 +2102,5 @@ const struct file_operations ceph_file_fops = {
1810
2102
.unlocked_ioctl = ceph_ioctl ,
1811
2103
.compat_ioctl = ceph_ioctl ,
1812
2104
.fallocate = ceph_fallocate ,
2105
+ .copy_file_range = ceph_copy_file_range ,
1813
2106
};
1814
-
0 commit comments