diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5e94808..1ea56bf6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,47 @@
+# Version 0.7.18
+
+## Features
+
+* Add `byrow(allequal)` as a special case of `byrow(isequal)`. (This feature needs at least Julia 1.8)
+
+## Fixes
+
+* Fix bugs in some corner cases of the stat routines
+* Fix unnecessary allocations in the stat routines
+
+# Version 0.7.17
+
+## Fixes
+
+* Fix a performance issue in `sort` due to the recent change in `Threads.@threads`.
+* Fix the allocation problem in computing `var` and `std` in the fast path of `gatherby`.
+* Fix an issue with Julia-latest.
+
+## Performance
+
+* Now we exploit multithreading when gathering observations for huge data sets.
+
+# Version 0.7.16
+
+## Fixes
+
+* Fix a problem that was causing tests to fail in Julia 1.9
+* Fix an issue with `eltype` and the output of `eachcol`. Now `eltype(::Type{<:DatasetColumns})` properly returns `AbstractDatasetColumn` instead of `AbstractVector`.
+* Fix a problem with `nonmissingtype` with `Union{}` output.
+* Fix an issue that was causing the join functions to sort already-sorted data sets, [issue #108](https://github.com/sl-solution/InMemoryDatasets.jl/issues/108)
+* Remove precompilation for Julia 1.9 - it causes an enormous amount of allocation during precompilation and loading
+
+## Features
+
+* Now `IMD` throws an error when a grouped data set is accessed after its parent has been modified.
+
+# Version 0.7.15
+
+## Fixes
+
+* Functions `searchsorted`, `searchsortedfirst`, and `searchsortedlast` now work with `DatasetColumn`
+* Fix a bug in `byrow(nunique)`
+
 # Version 0.7.14
 
 ## Fixes
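The new `byrow(allequal)` entry above is, per the changelog wording, just sugar over the existing `row_isequal` kernel. A minimal usage sketch (the data values are illustrative, not taken from the package's test suite):

```julia
using InMemoryDatasets

ds = Dataset(x = [1, 2, 3], y = [1, 5, 3], z = [1, 9, 3])

# `true` where every value in the row is equal (compared via `isequal`);
# equivalent to `byrow(ds, isequal, [:x, :y, :z])` with no `with` argument
byrow(ds, allequal, [:x, :y, :z])   # -> [true, false, true]  (needs Julia >= 1.8)
```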
diff --git a/Project.toml b/Project.toml
index 655523e3..5684a27f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "InMemoryDatasets"
 uuid = "5c01b14b-ab03-46ff-b164-14c663efdd9f"
 authors = ["sl-solution and contributors"]
-version = "0.7.14"
+version = "0.7.21"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
@@ -22,7 +22,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 Compat = "3.17, 4"
-DataAPI = "1.8"
+DataAPI = "1.16"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2, 1"
diff --git a/README.md b/README.md
index 22faca67..6da506ab 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ we do our best to keep the overall complexity of the package as low as possible
 * adding new features to the package
 * contributing to the package
 
-See [here](https://discourse.julialang.org/t/ann-a-new-lightning-fast-package-for-data-manipulation-in-pure-julia/78197) for some benchmarks.
+See [here](https://duckdblabs.github.io/db-benchmark/) for some benchmarks.
 
 # Features
 `InMemoryDatasets.jl` has many interesting features, here, we highlight some of our favourites (in no particular order):
diff --git a/docs/src/index.md b/docs/src/index.md
index 52b028f6..cc7af3a8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,7 +5,7 @@
 Welcome to the InMemoryDatasets.jl documentation!
 
 This resource aims to teach you everything you need to know to get up and running with the InMemoryDatasets.jl package.
 
-In memory Datasets is a collection of tools for working (manipulating, wrangling, cleaning, summarising,...) with tabular data in Julia.
+InMemoryDatasets is a collection of tools for working (manipulating, wrangling, cleaning, summarising,...) with tabular data in Julia.
 
 If you are new to InMemoryDatasets.jl, probably **[First steps with Datasets](https://sl-solution.github.io/InMemoryDatasets.jl/stable/man/basics/)** or **[Tutorial](https://sl-solution.github.io/InMemoryDatasets.jl/stable/man/tutorial/)** in manual should be good starting points.
diff --git a/docs/src/man/grouping.md b/docs/src/man/grouping.md
index 51dc008c..36074fea 100644
--- a/docs/src/man/grouping.md
+++ b/docs/src/man/grouping.md
@@ -155,6 +155,21 @@ julia> groupby(salary, 2)
   10        2
    3        3
    5        3
+
+julia> ds = Dataset(x=[1,1,2,2], y=[1,2,1,2], z=[1,1,1,1])
+
+julia> groupby!(ds, [:x, :y]) # group by more than one column
+4×3 Grouped Dataset with 4 groups
+Grouped by: x, y
+ Row │ x         y         z
+     │ identity  identity  identity
+     │ Int64?    Int64?    Int64?
+─────┼──────────────────────────────
+   1 │ 1         1         1
+   2 │ 1         2         1
+   3 │ 2         1         1
+   4 │ 2         2         1
+
 ```
 
 The `groupby!` and `groupby` functions accept the output of the `groupby` function. Thus, some may use these functions to incrementally group a data set.
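The "incrementally group" remark above is why `groupby` accepts its own output. A hedged sketch of that composition (column names are illustrative; per the sentence above, the refined result should match grouping by both columns at once):

```julia
ds = Dataset(x = [1, 1, 2, 2], y = [1, 2, 1, 2], z = [1, 1, 1, 1])

g1 = groupby(ds, :x)         # group by :x first
g2 = groupby(g1, :y)         # then refine the existing groups by :y
g  = groupby(ds, [:x, :y])   # expected to be equivalent to g2
```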
""" abstract type AbstractDataset end +abstract type AbstractDatasetColumn end + # DatasetColumn is a representation of a column of data set # it is wrapped into a new type to make sure that when ever a column is # selected, the data set is attached to it -struct DatasetColumn{T <: AbstractDataset, E} +struct DatasetColumn{T <: AbstractDataset, E} <: AbstractDatasetColumn col::Int ds::T val::E end -struct SubDatasetColumn{T <: AbstractDataset, E} +struct SubDatasetColumn{T <: AbstractDataset, E} <: AbstractDatasetColumn col::Int ds::T val::E @@ -308,7 +310,7 @@ function content(ds::AbstractDataset; output = false) for i in 1:ncol(ds) push!(f_v[1], all_names[i]) push!(f_v[2], getformat(ds, i)) - push!(f_v[3], nonmissingtype(eltype(ds[!, i]))) + push!(f_v[3], our_nonmissingtype(eltype(ds[!, i]))) end format_ds = Dataset(f_v, [:column, :format, :eltype], copycols = false) if !output diff --git a/src/abstractdataset/dscol.jl b/src/abstractdataset/dscol.jl index b5f9cf9f..be2dc40d 100644 --- a/src/abstractdataset/dscol.jl +++ b/src/abstractdataset/dscol.jl @@ -8,6 +8,8 @@ const SubOrDSCol = Union{SubDatasetColumn,DatasetColumn} # isequal also use for == , since we don't want missing be annoying Base.parent(col1::DatasetColumn) = col1.ds +Base.eachindex(col1::SubOrDSCol) = Base.axes1(col1) + Base.length(col1::SubOrDSCol) = length(__!(col1)) Base.size(col1::SubOrDSCol) = size(__!(col1)) Base.size(col1::SubOrDSCol, i::Integer) = size(__!(col1), i) @@ -18,12 +20,14 @@ Base.eltype(col1::SubOrDSCol) = eltype(__!(col1)) Base.ndims(col1::SubOrDSCol) = ndims(__!(col1)) Base.ndims(::Type{<:SubDatasetColumn}) = 1 Base.isassigned(col1::SubOrDSCol, i) = isassigned(__!(col1), i) +# FIXME: unsafe method - an alias of col1 is out and it can be modified without any control Base.identity(col1::SubOrDSCol) = identity(__!(col1)) Base.similar(col1::SubOrDSCol, args...) = similar(__!(col1), args...) Base.copy(col1::SubOrDSCol) = copy(__!(col1)) Base.pairs(col1::SubOrDSCol) = pairs(IndexLinear(), __!(col1)) Base.iterate(col1::SubOrDSCol, kwargs...) = iterate(__!(col1), kwargs...) PooledArrays.PooledArray(col1::SubOrDSCol; arg...) = PooledArray(__!(col1); arg...) +# FIXME: unsafe when alias are created Base.convert(T::Type{<:AbstractVector}, col1::SubOrDSCol) = convert(T, __!(col1)) DataAPI.refarray(col::SubOrDSCol) = DataAPI.refarray(__!(col)) DataAPI.refpool(col::SubOrDSCol) = DataAPI.refpool(__!(col)) @@ -160,3 +164,8 @@ function Base.sort!(col::SubOrDSCol; kws...) end Base.sort(col::SubOrDSCol; kws...) = sort(__!(col); kws...) Base.sortperm(col::SubOrDSCol; kws...) = sortperm(__!(col); kws...) + +Base.searchsortedfirst(col::SubOrDSCol, x; kws...) = searchsortedfirst(__!(col), x; kws...) +Base.searchsortedlast(col::SubOrDSCol, x; kws...) = searchsortedlast(__!(col), x; kws...) +Base.searchsorted(col::SubOrDSCol, x; kws...) = searchsorted(__!(col), x; kws...) + diff --git a/src/abstractdataset/iteration.jl b/src/abstractdataset/iteration.jl index 987a54b5..b3156936 100644 --- a/src/abstractdataset/iteration.jl +++ b/src/abstractdataset/iteration.jl @@ -234,7 +234,7 @@ Base.ndims(::DatasetColumns) = 1 Base.ndims(::Type{<:DatasetColumns}) = 1 Base.length(itr::DatasetColumns) = size(itr)[1] -Base.eltype(::Type{<:DatasetColumns}) = AbstractVector +Base.eltype(::Type{<:DatasetColumns}) = AbstractDatasetColumn Base.firstindex(itr::DatasetColumns) = 1 Base.lastindex(itr::DatasetColumns) = length(itr) @@ -394,7 +394,10 @@ Base.show(dfcs::DatasetColumns; # prevent using broadcasting to mutate columns e.g. 
diff --git a/src/abstractdataset/iteration.jl b/src/abstractdataset/iteration.jl
index 987a54b5..b3156936 100644
--- a/src/abstractdataset/iteration.jl
+++ b/src/abstractdataset/iteration.jl
@@ -234,7 +234,7 @@ Base.ndims(::DatasetColumns) = 1
 Base.ndims(::Type{<:DatasetColumns}) = 1
 
 Base.length(itr::DatasetColumns) = size(itr)[1]
-Base.eltype(::Type{<:DatasetColumns}) = AbstractVector
+Base.eltype(::Type{<:DatasetColumns}) = AbstractDatasetColumn
 
 Base.firstindex(itr::DatasetColumns) = 1
 Base.lastindex(itr::DatasetColumns) = length(itr)
@@ -394,7 +394,10 @@ Base.show(dfcs::DatasetColumns;
 # prevent using broadcasting to mutate columns e.g. in pop!.(eachcol(ds))
 # TODO customise Base.broadcasted to handle the situation
 for f in filter(x->occursin(r"!$", String(x)), names(Base))
-    @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColums is reserved."))
+    # FIXME due to a bug in Julia > 1.11 !?
+    if isdefined(Main, f)
+        @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColumns is reserved."))
+    end
 end
 for f in filter(x->occursin(r"!$", String(x)), names(Statistics))
     @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColums is reserved."))
diff --git a/src/abstractdataset/show.jl b/src/abstractdataset/show.jl
index a1f3c4a5..8911bb01 100644
--- a/src/abstractdataset/show.jl
+++ b/src/abstractdataset/show.jl
@@ -118,7 +118,7 @@ function compacttype(T::Type, maxwidth::Int=8)
     textwidth(sT) ≤ maxwidth && return sT
 
     if T >: Missing
-        T = nonmissingtype(T)
+        T = our_nonmissingtype(T)
         sT = string(T)
         suffix = "?"
         textwidth(sT) ≤ maxwidth && return sT * suffix
@@ -223,7 +223,7 @@ function _show(io::IO,
    alignment_regex_complex = [r"(?
 Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(isequal), cols::ColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
+if VERSION >= v"1.8"
+    byrow(ds::AbstractDataset, ::typeof(allequal), cols::MultiColumnIndex; threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = nothing, threads = threads)
+    byrow(ds::AbstractDataset, ::typeof(allequal), cols::ColumnIndex; threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = nothing, threads = threads)
+end
+
+
 byrow(ds::AbstractDataset, ::typeof(isless), cols::MultiColumnIndex; with, threads = nrow(ds) > Threads.nthreads()*10, rev::Bool = false, lt = isless) = row_isless(ds, cols, with, threads = threads, rev = rev, lt = lt)
 byrow(ds::AbstractDataset, ::typeof(isless), col::ColumnIndex; with, threads = nrow(ds) > Threads.nthreads()*10, rev::Bool = false, lt = isless) = row_isless(ds, [col], with, threads = threads, rev = rev, lt = lt)
 
@@ -167,7 +173,14 @@ byrow(ds::AbstractDataset, ::typeof(var), col::ColumnIndex; by = identity, dof =
 byrow(ds::AbstractDataset, ::typeof(std), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, dof = true, threads = nrow(ds) > Threads.nthreads()*10) = row_std(ds, by, cols; dof = dof, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(std), col::ColumnIndex; by = identity, dof = true, threads = nrow(ds) > Threads.nthreads()*10) = byrow(ds, std, [col]; by = by, dof = dof, threads = threads)
 
-byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true) = row_nunique(ds, by, cols; count_missing = count_missing)
+function byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true, threads=nrow(ds)>1000)
+    res = byrow(ds, x->length(Set(Base.Generator(by, x))), cols, threads=threads)
+    if count_missing
+        return res
+    else
+        return res .- row_any(ds, ismissing, cols)
+    end
+end
 byrow(ds::AbstractDataset, ::typeof(nunique), col::ColumnIndex; by = identity, count_missing = true) = byrow(ds, nunique, [col]; by = by, count_missing = count_missing)
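The rewritten `byrow(nunique)` above counts distinct values per row via a `Set` over the `by`-transformed values, avoiding the hash-collision problem that disabled the prehashing version (see the commented-out `row_nunique` later in this diff). A sketch of the semantics (data invented):

```julia
ds = Dataset(x = [1, 2, missing], y = [1, 3, missing], z = [2, 3, missing])

byrow(ds, nunique, 1:3)                         # -> [2, 2, 1]; missing counts as a value
byrow(ds, nunique, 1:3, count_missing = false)  # -> [2, 2, 0]; rows containing missing lose one count
```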
@@ -254,7 +267,7 @@ end
 function byrow(ds::AbstractDataset, f::Function, col::ColumnIndex; threads = nrow(ds)>1000, allowmissing::Bool = true)
     if threads
-        T = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(ds[!, col]))})
+        T = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(ds[!, col]))})
         if allowmissing
             res = Vector{Union{Missing, T}}(undef, nrow(ds))
         else
@@ -262,7 +275,7 @@ function byrow(ds::AbstractDataset, f::Function, col::ColumnIndex; threads = nro
         end
         _hp_map_a_function!(res, f, _columns(ds)[index(ds)[col]])
     else
-        T = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(ds[!, col]))})
+        T = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(ds[!, col]))})
         if allowmissing
             res = Vector{Union{Missing, T}}(undef, nrow(ds))
         else
diff --git a/src/byrow/doc.jl b/src/byrow/doc.jl
index 746ac473..a7a1f793 100644
--- a/src/byrow/doc.jl
+++ b/src/byrow/doc.jl
@@ -16,6 +16,8 @@ function Docs.getdoc(x::typeof(byrow), y)
         return _get_doc_byrow("prod")
     elseif y == Tuple{typeof(isequal)}
         return _get_doc_byrow("isequal")
+    elseif VERSION >= v"1.8" && y == Tuple{typeof(allequal)}
+        return _get_doc_byrow("allequal")
     elseif y == Tuple{typeof(isless)}
         return _get_doc_byrow("isless")
     elseif y == Tuple{typeof(in)}
@@ -105,6 +107,7 @@ Perform a row-wise operation specified by `fun` on selected columns `cols`. Gene
 # Reduction operations
 - `all`
+- `allequal` (this needs Julia 1.8 or later)
 - `any`
 - `argmax`
 - `argmin`
@@ -369,6 +372,14 @@ julia> byrow(ds, isequal, [1,2], with = [2,2,2,3,3,3])
 0
 0
 ```
+@@@@allequal@@@@
+    byrow(ds::AbstractDataset, allequal, cols; [threads])
+
+Returns a boolean vector which is `true` if all values in the corresponding row are equal (using `isequal`).
+
+Passing `threads = false` disables multithreaded computations.
+
+See [`byrow(isequal)`](@ref), [`byrow(isless)`](@ref), [`byrow(in)`](@ref), [`byrow(issorted)`](@ref)
 @@@@isless@@@@
     byrow(ds::AbstractDataset, isless, cols, [with, threads, rev = false, lt = isless])
diff --git a/src/byrow/hp_row_functions.jl b/src/byrow/hp_row_functions.jl
index 371e0cc3..40abfda5 100644
--- a/src/byrow/hp_row_functions.jl
+++ b/src/byrow/hp_row_functions.jl
@@ -1,6 +1,6 @@
 function hp_row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
-    T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])
+    T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     m = Matrix{T}(ds[!, colsidx])
     Threads.@threads for i in 1:size(m, 1)
         @views sort!(m[i, :]; kwargs...)
@@ -92,7 +92,7 @@ function _hp_row_generic_vec!(res, ds, f, colsidx, ::Val{T}) where T
     max_cz = length(res) - 1000 - (loopsize - 1)*1000
     inmat_all = [Matrix{T}(undef, length(colsidx), max_cz) for i in 1:nt]
     # make sure that the variable inside the loop are not the same as the out of scope one
-    Threads.@threads for i in 1:loopsize
+    Threads.@threads :static for i in 1:loopsize
         t_st = i*1000 + 1
         i == loopsize ? t_en = length(res) : t_en = (i+1)*1000
         _fill_matrix!(inmat_all[Threads.threadid()], all_data, t_st:t_en, colsidx)
diff --git a/src/byrow/row_functions.jl b/src/byrow/row_functions.jl
index f7f4058e..2d7df23a 100644
--- a/src/byrow/row_functions.jl
+++ b/src/byrow/row_functions.jl
@@ -33,9 +33,9 @@ function row_sum(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missi
     colsidx = multiple_getindex(index(ds), cols)
     CT = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     T = Core.Compiler.return_type(f, Tuple{CT})
-    CT = nonmissingtype(T)
-    CT <: Base.SmallSigned ? CT = Int : nothing
-    CT <: Base.SmallUnsigned ? CT = UInt : nothing
+    CT = our_nonmissingtype(T)
+    CT <: SMALLSIGNED ? CT = Int : nothing
+    CT <: SMALLUNSIGNED ? CT = UInt : nothing
     CT <: Bool ? CT = Int : nothing
     T = Union{Missing, CT}
     init0 = _missings(T, nrow(ds))
@@ -68,9 +68,9 @@ function row_prod(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missi
     colsidx = multiple_getindex(index(ds), cols)
     CT = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     T = Core.Compiler.return_type(f, Tuple{CT})
-    CT = nonmissingtype(T)
-    CT <: Base.SmallSigned ? CT = Int : nothing
-    CT <: Base.SmallUnsigned ? CT = UInt : nothing
+    CT = our_nonmissingtype(T)
+    CT <: SMALLSIGNED ? CT = Int : nothing
+    CT <: SMALLUNSIGNED ? CT = UInt : nothing
     CT <: Bool ? CT = Int : nothing
     T = Union{Missing, CT}
     init0 = _missings(T, nrow(ds))
@@ -744,9 +744,9 @@ function row_cumsum!(ds::Dataset, cols = names(ds, Union{Missing, Number}); miss
     colsidx = index(ds)[cols]
     T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     if T <: Union{Missing, INTEGERS}
-        T <: Union{Missing, Base.SmallSigned}
-        T = T <: Union{Missing, Base.SmallSigned, Bool} ? Union{Int, Missing} : T
-        T = T <: Union{Missing, Base.SmallUnsigned} ? Union{Missing, UInt} : T
+        T <: Union{Missing, SMALLSIGNED}
+        T = T <: Union{Missing, SMALLSIGNED, Bool} ? Union{Int, Missing} : T
+        T = T <: Union{Missing, SMALLUNSIGNED} ? Union{Missing, UInt} : T
     end
     for i in colsidx
         if eltype(ds[!, i]) >: Missing
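The `SMALLSIGNED`/`SMALLUNSIGNED` branches above widen small integer accumulators before row reductions, much as `Base.sum` does, so narrow columns cannot overflow. A minimal illustration (data invented):

```julia
ds = Dataset(a = Int8[100, 100], b = Int8[100, 100])

# the accumulator type is widened from Int8 to Int, so 100 + 100 does not wrap
byrow(ds, sum, [:a, :b])   # -> [200, 200] instead of Int8 overflow
```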
@@ -1004,7 +1004,7 @@
 function row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
-    T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])
+    T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     m = Matrix{T}(ds[!, colsidx])
     sort!(m; dims = 2, kwargs...)
     for i in 1:length(colsidx)
@@ -1077,25 +1077,26 @@ function _fill_dict_and_add!(init0, dict, prehashed, n, p)
     end
 end
 
-function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
-    colsidx = multiple_getindex(index(ds), cols)
-    prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
-    allcols = view(_columns(ds),colsidx)
+# This is not working - we only have the hash values, and in many cases (e.g. 2.1 and 4611911198408756429) the hashes are the same
+# function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
+#     colsidx = multiple_getindex(index(ds), cols)
+#     prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
+#     allcols = view(_columns(ds),colsidx)
 
-    for j in 1:size(prehashed,2)
-        _fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
-    end
+#     for j in 1:size(prehashed,2)
+#         _fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
+#     end
 
-    init0 = zeros(Int32, size(ds,1))
-    dict = Dict{_Prehashed, Nothing}()
-    _fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
-    if count_missing
-        return init0
-    else
-        return init0 .- row_any(ds, ismissing, cols)
-    end
-end
-row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
+#     init0 = zeros(Int32, size(ds,1))
+#     dict = Dict{_Prehashed, Nothing}()
+#     _fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
+#     if count_missing
+#         return init0
+#     else
+#         return init0 .- row_any(ds, ismissing, cols)
+#     end
+# end
+# row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
 
 Base.@propagate_inbounds function _op_for_hash!(x, y, f, lo, hi)
     @simd for i in lo:hi
diff --git a/src/byrow/util.jl b/src/byrow/util.jl
index f77a5e47..a42d2814 100644
--- a/src/byrow/util.jl
+++ b/src/byrow/util.jl
@@ -296,6 +296,11 @@ end
     return pos
 end
 
+# before Julia 1.10 these functions were defined in Ryu; however, they moved to Base and their syntax has changed.
+# we only use them here, so we define them for our own purpose
+_memcpy(d, doff, s, soff, n) = (ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), d + doff - 1, s + soff - 1, n); nothing)
+_memmove(d, doff, s, soff, n) = (ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), d + doff - 1, s + soff - 1, n); nothing)
+
 ### From Base.Ryu, because we need buf to be a View of an array, not a Vector (maybe we should change it in Ryu?)
 function _writeshortest(buf, pos, x::T,
                         plus=false, space=false, hash=true,
@@ -423,7 +428,7 @@ function _writeshortest(buf, pos, x::T,
     end
 
     i = 0
     ptr = pointer(buf)
-    ptr2 = pointer(Base.Ryu.DIGIT_TABLE)
+    ptr2 = pointer(our_DIGIT_TABLE)
     if (output >> 32) != 0
         q = output ÷ 100000000
         output2 = (output % UInt32) - UInt32(100000000) * (q % UInt32)
@@ -436,10 +441,10 @@ function _writeshortest(buf, pos, x::T,
         c1 = (c ÷ 100) << 1
         d0 = (d % 100) << 1
         d1 = (d ÷ 100) << 1
-        Base.Ryu.memcpy(ptr, pos + olength - 2, ptr2, c0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 4, ptr2, c1 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 6, ptr2, d0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 8, ptr2, d1 + 1, 2)
+        _memcpy(ptr, pos + olength - 2, ptr2, c0 + 1, 2)
+        _memcpy(ptr, pos + olength - 4, ptr2, c1 + 1, 2)
+        _memcpy(ptr, pos + olength - 6, ptr2, d0 + 1, 2)
+        _memcpy(ptr, pos + olength - 8, ptr2, d1 + 1, 2)
         i += 8
     end
     output2 = output % UInt32
@@ -448,20 +453,20 @@ function _writeshortest(buf, pos, x::T,
         output2 = div(output2, UInt32(10000))
         c0 = (c % 100) << 1
         c1 = (c ÷ 100) << 1
-        Base.Ryu.memcpy(ptr, pos + olength - i - 2, ptr2, c0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - i - 4, ptr2, c1 + 1, 2)
+        _memcpy(ptr, pos + olength - i - 2, ptr2, c0 + 1, 2)
+        _memcpy(ptr, pos + olength - i - 4, ptr2, c1 + 1, 2)
         i += 4
     end
     if output2 >= 100
         c = (output2 % UInt32(100)) << 1
         output2 = div(output2, UInt32(100))
-        Base.Ryu.memcpy(ptr, pos + olength - i - 2, ptr2, c + 1, 2)
+        _memcpy(ptr, pos + olength - i - 2, ptr2, c + 1, 2)
         i += 2
     end
     if output2 >= 10
         c = output2 << 1
-        buf[pos + 1] = Base.Ryu.DIGIT_TABLE[c + 2]
-        buf[pos - exp_form] = Base.Ryu.DIGIT_TABLE[c + 1]
+        buf[pos + 1] = our_DIGIT_TABLE[c + 2]
+        buf[pos - exp_form] = our_DIGIT_TABLE[c + 1]
     else
         buf[pos - exp_form] = UInt8('0') + (output2 % UInt8)
     end
@@ -498,7 +503,7 @@ function _writeshortest(buf, pos, x::T,
         end
     else
         pointoff = olength - abs(nexp)
-        Base.Ryu.memmove(ptr, pos + pointoff + 1, ptr, pos + pointoff, olength - pointoff + 1)
+        _memmove(ptr, pos + pointoff + 1, ptr, pos + pointoff, olength - pointoff + 1)
         buf[pos + pointoff] = decchar
         pos += olength + 1
         precision -= olength
@@ -543,11 +548,11 @@ function _writeshortest(buf, pos, x::T,
 
     if exp2 >= 100
         c = exp2 % 10
-        Base.Ryu.memcpy(ptr, pos, ptr2, 2 * div(exp2, 10) + 1, 2)
+        _memcpy(ptr, pos, ptr2, 2 * div(exp2, 10) + 1, 2)
         buf[pos + 2] = UInt8('0') + (c % UInt8)
         pos += 3
     elseif exp2 >= 10
-        Base.Ryu.memcpy(ptr, pos, ptr2, 2 * exp2 + 1, 2)
+        _memcpy(ptr, pos, ptr2, 2 * exp2 + 1, 2)
         pos += 2
     else
         if padexp
@@ -565,3 +570,17 @@ function _writeshortest(buf, pos, x::T,
 
     return pos
 end
+
+# FIXME in versions > 1.11 Julia has changed DIGIT_TABLE; we need to update this for our purpose
+const our_DIGIT_TABLE = UInt8[
+    '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+    '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+    '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+    '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+    '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+    '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+    '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+    '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+    '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+    '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+]
\ No newline at end of file
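`our_DIGIT_TABLE` is the classic Ryu two-digit lookup: the ASCII digits of 00 through 99 stored back to back, so both digits of a two-digit chunk are written with a single 2-byte `_memcpy`. A sanity check of the indexing convention used by `_writeshortest` above:

```julia
# for a two-digit value c, its digits live at table[2c + 1] and table[2c + 2];
# `_memcpy(ptr, pos, ptr2, 2c + 1, 2)` copies both bytes at once
c = 73
our_DIGIT_TABLE[2c + 1] == UInt8('7')   # tens digit
our_DIGIT_TABLE[2c + 2] == UInt8('3')   # ones digit
```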
diff --git a/src/dataset/getindex.jl b/src/dataset/getindex.jl
index 2fa9645d..3894eeb6 100644
--- a/src/dataset/getindex.jl
+++ b/src/dataset/getindex.jl
@@ -61,6 +61,8 @@ function _check_consistency(ds::Dataset)
 end
 
 function _check_consistency(ds::AbstractDataset)
+    # FIXME we should check the creation date of sub-data; however, it is not working in some situations, e.g. modify!(sds, ...)
+    # TODO however, we should add this whenever it is possible: getfield(ds, :created) == _get_lastmodified(_attributes(parent(ds)))
     if ds isa SubDataset
         @assert length(index(ds).remap) == length(index(parent(ds))) "The parent data set which this view is based on, has been modified. To fix the issue recreate the view"
     end
diff --git a/src/dataset/modify.jl b/src/dataset/modify.jl
index 6f0afe56..adcc20dd 100644
--- a/src/dataset/modify.jl
+++ b/src/dataset/modify.jl
@@ -255,7 +255,7 @@ function normalize_modify!(outidx::Index, idx,
                            @nospecialize(sel::Pair{<:ColumnIndex, <:Vector{<:Base.Callable}}))
     colsidx = outidx[sel.first]
-    normalize_modify!(outidx, idx, colsidx .=> sel.second[i])
+    normalize_modify!(outidx, idx, colsidx .=> sel.second)
     return res
 end
@@ -597,7 +597,7 @@ end
 # the number of destination can be smaller or greater than the number of elements of Tuple,
 function _modify_multiple_out!(ds, x, dst)
-    !(nonmissingtype(eltype(x)) <: Tuple) && throw(ArgumentError("to use `splitter`, the source column must be a vector of Tuple"))
+    !(our_nonmissingtype(eltype(x)) <: Tuple) && throw(ArgumentError("to use `splitter`, the source column must be a vector of Tuple"))
     tb = Tables.columntable(x)
     for j in 1:length(dst)
         try
diff --git a/src/dataset/other.jl b/src/dataset/other.jl
index d5ce1e6d..66c9c217 100755
--- a/src/dataset/other.jl
+++ b/src/dataset/other.jl
@@ -492,7 +492,7 @@ function Base.map!(ds::AbstractDataset, f::Vector{<:Function}, cols::MultiColumn
         # Core.Compiler.return_type cannot handle the situations like x->ismissing(x) ? 0 : x when x is missing and float, since the output of Core.Compiler.return_type is Union{Missing, Float64, Int64}
         # we remove missing and then check the result,
         # TODO is there any problem with this?
-        T = Core.Compiler.return_type(f[j], Tuple{nonmissingtype(CT)})
+        T = Core.Compiler.return_type(f[j], Tuple{our_nonmissingtype(CT)})
         T = Union{Missing, T}
         if promote_type(T, CT) <: CT
             if threads && DataAPI.refpool(_columns(ds)[colsidx[j]]) === nothing
diff --git a/src/dataset/transpose.jl b/src/dataset/transpose.jl
index b1effdd5..d132a3d2 100644
--- a/src/dataset/transpose.jl
+++ b/src/dataset/transpose.jl
@@ -408,7 +408,7 @@ end
 function _fill_outputmat_withoutid(T, in_cols, ds, starts, perms, new_col_names, row_names_length, threads; default_fill = missing)
-    @assert _check_allocation_limit(nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there is not enough resource, check the passed arguments."
+    @assert _check_allocation_limit(our_nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there are not enough resources; check the passed arguments."
     CT = promote_type(T, typeof(default_fill))
     # outputmat = [__fill!(_our_vect_alloc(CT, row_names_length*_ngroups(ds)), default_fill) for _ in 1:length(new_col_names)]
     outputmat = Vector{typeof(_our_vect_alloc(CT, 0))}(undef, length(new_col_names))
@@ -420,7 +420,7 @@ end
 function _fill_outputmat_withid(T, in_cols, ds, starts, perms, ids, new_col_names, row_names_length, threads; default_fill = missing)
-    @assert _check_allocation_limit(nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there is not enough resource, check the passed arguments."
+    @assert _check_allocation_limit(our_nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there are not enough resources; check the passed arguments."
     CT = promote_type(T, typeof(default_fill))
     # outputmat = [fill!(_our_vect_alloc(CT, row_names_length*_ngroups(ds)), default_fill) for _ in 1:length(new_col_names)]
     outputmat = Vector{typeof(_our_vect_alloc(CT, 0))}(undef, length(new_col_names))
@@ -787,7 +787,7 @@ function flatten!(ds::Dataset,
     for col in 2:length(idxcols)
         if mapformats
             f_fmt = getformat(ds, idxcols[col])
-            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col]), threads = threads)
+            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col], threads = threads))
         else
             push!(all_idxcols, _columns(ds)[idxcols[col]])
         end
@@ -854,7 +854,7 @@ function flatten(ds::AbstractDataset,
     for col in 2:length(idxcols)
         if mapformats
             f_fmt = getformat(ds, idxcols[col])
-            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col]), threads = threads)
+            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col], threads = threads))
         else
             push!(all_idxcols, _columns(ds)[idxcols[col]])
         end
diff --git a/src/join/closejoin.jl b/src/join/closejoin.jl
index c969dae9..64799912 100644
--- a/src/join/closejoin.jl
+++ b/src/join/closejoin.jl
@@ -274,7 +274,8 @@ function _fill_right_cols_table_close!(_res, x, ranges, total, borderval, fill_
 end
 
-function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, direction, lmf, rmf, j; nsfpaj = true, threads = true)
+function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, direction, lmf, rmf, j; nsfpaj=nsfpaj, threads = true)
+    nsfpaj_in = nsfpaj[1]
     var_l = _columns(dsl)[oncols_left[j]]
     var_r = _columns(dsr)[oncols_right[j]]
     l_idx = oncols_left[j]
@@ -292,8 +293,8 @@ function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols
 
     T1 = Core.Compiler.return_type(_fl, Tuple{eltype(var_l)})
 
-    if DataAPI.refpool(var_r) !== nothing && nsfpaj
-        true && throw(ErrorException("we shouldn't end up here"))
+    if DataAPI.refpool(var_r) !== nothing && nsfpaj_in
+        throw(ErrorException("we shouldn't end up here"))
     else
         T2 = Core.Compiler.return_type(_fr, Tuple{eltype(var_r)})
         if direction == :backward
@@ -327,10 +328,10 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
        throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically."))
    end
-    nsfpaj = true
+    nsfpaj = [true]
     # if the column for close join is a PA we cannot use the fast path
     if DataAPI.refpool(_columns(dsr)[oncols_right[end]]) !== nothing
-        nsfpaj = false
+        nsfpaj = [false]
     end
     if length(oncols_left) > 1 && method == :hash
         ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], onright[1:end-1], mapformats, true, Val(T), threads = threads)
diff --git a/src/join/join.jl b/src/join/join.jl
index 8e0ac90a..4f966e6c 100644
--- a/src/join/join.jl
+++ b/src/join/join.jl
@@ -70,48 +70,63 @@ function _fill_range_for_accelerated_join!(ranges, starts, loc, x, f, sz, chunk;
     end
 end
 # TODO how the hashing behave for Categorical Arrays?
-function _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, chunk = 2^10; nsfpaj = true, threads = true)
+function _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, chunk = 2^10; nsfpaj=nsfpaj, threads = true)
+    # nsfpaj has no default value, to make sure the caller passes it
+    # we use a vector to represent nsfpaj because we may override its value
+    nsfpaj_in = nsfpaj[1]
+
     if isempty(dsr)
         idx = []
         fill!(ranges, 1:nrow(dsr))
         last_valid_range = -1
     else
-        if accelerate
-            if mapformats[2]
-                _fr = getformat(dsr, oncols_right[1])
-            else
-                _fr = identity
-            end
-            grng = _divide_for_fast_join(_columns(dsr)[oncols_right[1]], _fr, chunk; threads = threads)
-            if mapformats[1]
-                _fl = getformat(dsl, oncols_left[1])
-            else
-                _fl = identity
-            end
-            _fill_range_for_accelerated_join!(ranges, grng.starts, grng.starts_loc, _columns(dsl)[oncols_left[1]], _fl, nrow(dsr), chunk; threads = threads)
-            if dsr isa SubDataset
-                starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
-
-            else
-                starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
-            end
+        # check whether the data are already sorted; if so, it overrides accelerate
+        if _check_for_fast_sort(dsr, oncols_right, fill(false, length(oncols_right)), mapformats[2]; notsortpaforjoin = false, givenrange = nothing) == 0
+            # if it is already sorted based on what we want, we can safely change nsfpaj to false
+            nsfpaj[1] = false
+            idx = 1:nrow(dsr)
+            last_valid_range = _ngroups(dsr)
+            fill!(ranges, 1:nrow(dsr))
         else
-            if dsr isa SubDataset
-                starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, threads = threads)
+
+            if accelerate
+                if mapformats[2]
+                    _fr = getformat(dsr, oncols_right[1])
+                else
+                    _fr = identity
+                end
+                grng = _divide_for_fast_join(_columns(dsr)[oncols_right[1]], _fr, chunk; threads = threads)
+                if mapformats[1]
+                    _fl = getformat(dsl, oncols_left[1])
+                else
+                    _fl = identity
+                end
+                _fill_range_for_accelerated_join!(ranges, grng.starts, grng.starts_loc, _columns(dsl)[oncols_left[1]], _fl, nrow(dsr), chunk; threads = threads)
+                if dsr isa SubDataset
+                    starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
+
+                else
+                    starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
+                end
             else
-                starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, threads = threads)
+                if dsr isa SubDataset
+                    starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, threads = threads)
+                else
+                    starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, threads = threads)
+                end
+                fill!(ranges, 1:nrow(dsr))
             end
-            fill!(ranges, 1:nrow(dsr))
         end
     end
     idx, last_valid_range == length(idx)
 end
 
 function _sort_for_join_after_hash(dsr, oncols_right, stable, alg, mapformats, nsfpaj, grng; threads = true)
+    nsfpaj_in = nsfpaj[1]
     if dsr isa SubDataset
-        starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
+        starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
     else
-        starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
+        starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
     end
 end
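A note on the recurring `nsfpaj = [true]` pattern introduced across these join functions: a one-element vector serves as a mutable box, so a callee (here the already-sorted fast path) can flip the flag for its caller. A minimal sketch of the pattern; `Ref{Bool}` would be the more conventional spelling:

```julia
flag = [true]                    # one-element vector as a mutable box
disable!(f) = (f[1] = false)     # a callee can override the caller's flag
disable!(flag)
flag[1]                          # -> false
```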
@@ -423,7 +438,8 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en,
     our_cumsum!(revised_ends)
 end
 
-function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj = true, threads = true)
+function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj=nsfpaj, threads = true)
+    nsfpaj_in = nsfpaj[1]
     var_l = _columns(dsl)[oncols_left[j]]
     var_r = _columns(dsr)[oncols_right[j]]
     l_idx = oncols_left[j]
@@ -441,7 +457,7 @@ function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_
 
     T1 = Core.Compiler.return_type(DataAPI.unwrap∘_fl, Tuple{eltype(var_l)})
 
-    if DataAPI.refpool(var_r) !== nothing && nsfpaj
+    if DataAPI.refpool(var_r) !== nothing && nsfpaj_in
         # sort taken care for refs ordering of modified values, but we still need to change refs
         if _fr == identity
             var_r_cpy = var_r
@@ -463,6 +479,7 @@ end
 function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T
     isempty(dsl) && return copy(dsl)
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -480,10 +497,10 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate; threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate; nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads)
         end
     end
     new_ends = map(x -> max(1, length(x)), ranges)
@@ -553,6 +570,7 @@ end
 function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T
     isempty(dsl) && return dsl
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -569,9 +587,9 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
     if !all(x->length(x) <= 1, ranges)
@@ -660,11 +678,11 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
        throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically."))
    end
-    nsfpaj = true
+    nsfpaj = [true]
     # if the columns for an inequality-like join are PA we cannot use the fast path
     if type != :both
         if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols)
-            nsfpaj = false
+            nsfpaj = [false]
         end
     end
     # if (onright_range === nothing || length(onleft) > 1) is false, then we have inequality kind join with no exact match join
@@ -689,7 +707,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range === nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads)
 
         for j in 1:length(oncols_left)-1
             _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads)
@@ -784,7 +802,7 @@ function _in(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
     isempty(dsl) && return Bool[]
     oncols_left = onleft
     oncols_right = onright
-
+    nsfpaj = [true]
     # use Set when there is only one column in `on`
     if length(oncols_right) == 1
         if mapformats[1]
@@ -800,9 +818,9 @@ function _in(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
         return _in_use_Set(_columns(dsl)[oncols_left[1]], _columns(dsr)[oncols_right[1]], _fl, _fr, threads = threads)
     end
     ranges = Vector{UnitRange{T}}(undef, nrow(dsl))
-    idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+    idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
     for j in 1:length(oncols_left)
-        _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+        _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
     end
     map(x -> length(x) == 0 ? false : true, ranges)
 end
@@ -875,6 +893,7 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
     (isempty(dsl) || isempty(dsr)) && throw(ArgumentError("in `outerjoin` both left and right tables must be non-empty"))
     oncols_left = onleft
     oncols_right = onright
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -889,9 +908,9 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
     new_ends = map(x -> max(1, length(x)), ranges)
diff --git a/src/join/join_dict.jl b/src/join/join_dict.jl
index e6a1901b..eb1f7826 100644
--- a/src/join/join_dict.jl
+++ b/src/join/join_dict.jl
@@ -60,7 +60,7 @@ function _create_dictionary_for_join(f, v, fl, vl, ::Val{T}) where T
         maxval = hp_maximum(DataAPI.refarray(v))
         rangelen = maxval - minval + 1
         _create_dictionary_for_join_int(identity, DataAPI.refarray(v), minval, rangelen, Val(T))
-    elseif nonmissingtype(return_type(f, v)) <: AbstractVector{<:Union{Missing, INTEGERS}} && nonmissingtype(return_type(fl, vl)) <: AbstractVector{<:Union{Missing, INTEGERS}}
+    elseif our_nonmissingtype(return_type(f, v)) <: AbstractVector{<:Union{Missing, INTEGERS}} && our_nonmissingtype(return_type(fl, vl)) <: AbstractVector{<:Union{Missing, INTEGERS}}
         minval = hp_minimum(f, v)
         # if minval is missing all values are missing
         if ismissing(minval)
@@ -531,8 +531,8 @@ function _update!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T};
     for j in 1:length(right_cols)
         if haskey(index(dsl).lookup, _names(dsr)[right_cols[j]])
             left_cols_idx = index(dsl)[_names(dsr)[right_cols[j]]]
-            TL = nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
-            TR = nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
+            TL = our_nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
+            TR = our_nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
             if promote_type(TR, TL) <: TL
                 _update_left_with_right!(_columns(dsl)[left_cols_idx], _columns(dsr)[right_cols[j]], ranges, allowmissing, f_mode, threads = threads, op = op)
             end
diff --git a/src/join/update.jl b/src/join/update.jl
index dfd155fc..355e04d9 100644
--- a/src/join/update.jl
+++ b/src/join/update.jl
@@ -28,6 +28,7 @@ end
 function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, check = true, allowmissing = true, mode = :all, mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, usehash = true, method = :sort, threads = true, op = nothing) where T
     isempty(dsl) && return dsl
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, true, Val(T); threads = threads)
     elseif method == :sort
@@ -42,10 @@ function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright,
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj=nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
@@ -59,8 +60,8 @@ function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright,
     for j in 1:length(right_cols)
         if haskey(index(dsl).lookup, _names(dsr)[right_cols[j]])
             left_cols_idx = index(dsl)[_names(dsr)[right_cols[j]]]
-            TL = nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
-            TR = nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
+            TL = our_nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
+            TR = our_nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
             if promote_type(TR, TL) <: TL
                 _update_left_with_right!(_columns(dsl)[left_cols_idx], view(_columns(dsr)[right_cols[j]], idx), ranges, allowmissing, f_mode, threads = threads, op = op)
             end
diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl
index b91b281a..958f57fc 100644
--- a/src/other/broadcasting.jl
+++ b/src/other/broadcasting.jl
@@ -226,7 +226,7 @@ function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataset)
     if src isa SubDataset
         if !wascopied
             src = SubDataset(_our_copy(parent(src), copycols=false),
-                             index(src), rows(src))
+                             index(src), rows(src), _get_lastmodified(_attributes(parent(src))))
         end
         parentidx = parentcols(index(src), i)
         parent(src)[!, parentidx] = Base.unaliascopy(_columns(parent(src))[parentidx])
@@ -254,7 +254,7 @@ function _broadcast_unalias_helper(dest::AbstractDataset, scol::AbstractVector,
     if src isa SubDataset
         if !wascopied
             src =SubDataset(_our_copy(parent(src), copycols=false),
-                            index(src), rows(src))
+                            index(src), rows(src), _get_lastmodified(_attributes(parent(src))))
         end
         parentidx = parentcols(index(src), col2)
         parent(src)[!, parentidx] = Base.unaliascopy(_columns(parent(src))[parentidx])
diff --git a/src/other/utils.jl b/src/other/utils.jl
index c6306412..fc8d14df 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -1,6 +1,16 @@
 const INTEGERS = Union{Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Bool}
 const FLOATS = Union{Float16, Float32, Float64}
 
+function our_nonmissingtype(x)
+    T = nonmissingtype(x)
+    if T === Union{}
+        Missing
+    else
+        T
+    end
+end
+
+
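This wrapper is the fix for the 0.7.16 changelog item about `nonmissingtype` and `Union{}`: for a column whose element type is plain `Missing`, stripping `Missing` leaves the empty union, which breaks downstream `Tuple{...}` construction and subtype checks. A quick illustration:

```julia
nonmissingtype(Union{Missing, Int})   # -> Int64
nonmissingtype(Missing)               # -> Union{}, the empty union
our_nonmissingtype(Missing)           # -> Missing, a type that can still be used
```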
 # work around slow allocation of type union in julia
 function _our_vect_alloc(T, len)
     if len > 0
@@ -34,7 +44,7 @@ function return_type(f::Function, x)
     if eltype(x) <: AbstractVector
         return return_type_tuple(f, x)
     end
-    CT = nonmissingtype(eltype(x))
+    CT = our_nonmissingtype(eltype(x))
     T = Core.Compiler.return_type(f, Tuple{Vector{CT}})
     # workaround for SubArray type
     if T <: SubArray
@@ -50,7 +60,7 @@ function return_type(f::Function, x)
 end
 
 function return_type_tuple(f::Function, x)
-    CT = ntuple(i -> nonmissingtype(eltype(x[i])), length(x))
+    CT = ntuple(i -> our_nonmissingtype(eltype(x[i])), length(x))
     T = Core.Compiler.return_type(f, Tuple{ntuple(i->Vector{CT[i]}, length(x))...})
     # workaround for SubArray type
     if T <: SubArray
@@ -420,7 +430,7 @@ function _gather_groups(ds, cols, ::Val{T}; mapformats = false, stable = true, t
     _max_level = nrow(ds)
 
-    if nrow(ds) > 2^23 && !stable && 5 2^23 && !stable && 5_grabrefs(_columns(ds)[colidx[i]]), length(colidx))
-    create_dict_hugeds_multicols(colsvals, rhashes, Val(T))
+    if threads
+        rngs, sz = _gather_groups_hugeds_splitter(rhashes, Val(T))
+        groups = Vector{T}(undef, length(rhashes))
+        ngroups_all = _gather_groups_hugeds_collector(groups, rngs, sz, rhashes, colsvals, Val(T))
+        ngroups = _gather_groups_hugeds_cleanup!(groups, ngroups_all, rngs, sz)
+    else
+        groups = Vector{T}(undef, length(rhashes))
+        rng = 1:length(rhashes)
+        ngroups = create_dict_hugeds_multicols!(groups, rng, colsvals, rhashes, Val(T))
+    end
+    groups, T[], ngroups
+end
+
+# TODO what happens if the values are not randomly grouped based on cols
+function _gather_groups_hugeds_splitter(rhashes, ::Val{T}) where T
+    nt = 997 # TODO this should be an argument; however, we must be careful that this value doesn't degrade the actual dictionary creation in subsequent steps
+    sz = zeros(T, nt)
+    # It is safe to record _ids - memory will be released and it does not add extra memory to the total amount (we later need to allocate groups)
+    _id = Vector{Int16}(undef, length(rhashes))
+    for i in eachindex(rhashes)
+        _id[i] = (rhashes[i] % nt)+1
+        sz[_id[i]] += 1
+    end
+    rngs = Vector{T}(undef, length(rhashes))
+    prepend!(sz, T(0))
+    our_cumsum!(sz)
+    sz_cp = copy(sz)
+
+    for i in eachindex(rhashes)
+        idx=_id[i]
+        sz_cp[idx] += 1
+        rngs[sz_cp[idx]] = i
+    end
+    rngs, sz
+end
+
+function _gather_groups_hugeds_collector(groups, rngs, sz, rhashes, colsvals, ::Val{T}) where T
+    ngroups = Vector{Int}(undef, length(sz)-1)
+    Threads.@threads for i in 2:length(sz)
+        hi = sz[i]
+        lo = sz[i-1]+1
+        _tmp = view(groups, view(rngs, lo:hi))
+        ngroups[i-1] = create_dict_hugeds_multicols!(_tmp, view(rngs, lo:hi), colsvals, rhashes, Val(T))
+    end
+    ngroups
+end
+
+function _gather_groups_hugeds_cleanup!(groups, ngroups, rngs, sz)
+    our_cumsum!(ngroups)
+    Threads.@threads for i in 3:length(sz)
+        hi=sz[i]
+        lo=sz[i-1]+1
+        for j in lo:hi
+            groups[rngs[j]] += ngroups[i-2]
+        end
+    end
+    return ngroups[end]
 end
 
-function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
-    sz = max(1 + ((5 * length(rhashes)) >> 2), 16)
+# groups is a list of integers for which the dict is going to be created
+# get index and set index should sometimes be adjusted based on rng
+# make sure groups is a Vector{T}
+function create_dict_hugeds_multicols!(groups, rng, colvals, rhashes, ::Val{T}) where T
+    isempty(rng) && return 0
+    sz = max(1 + ((5 * length(groups)) >> 2), 16)
     sz = 1 << (8 * sizeof(sz) - leading_zeros(sz - 1))
-    @assert 4 * sz >= 5 * length(rhashes)
+    @assert 4 * sz >= 5 * length(groups)
     szm1 = sz-1
     gslots = zeros(T, sz)
-    groups = Vector{T}(undef, length(rhashes))
     ngroups = 0
-    @inbounds for i in eachindex(rhashes)
+    @inbounds for i in eachindex(rng)
         # find the slot and group index for a row
-        slotix = rhashes[i] & szm1 + 1
+        slotix = rhashes[rng[i]] & szm1 + 1
         gix = -1
         probe = 0
         while true
@@ -570,8 +639,8 @@ function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
                 gslots[slotix] = i
                 gix = ngroups += 1
                 break
-            elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
-                if isequal_row(colvals, i, Int(g_row)) # hit
+            elseif rhashes[rng[i]] == rhashes[rng[g_row]] # occupied slot, check if miss or hit
+                if isequal_row(colvals, Int(rng[i]), Int(rng[g_row])) # hit
                     gix = groups[g_row]
                     break
                 end
@@ -580,9 +649,10 @@ function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
             probe += 1
             @assert probe < sz
         end
+        # groups[i] has done its work; we can modify it
         groups[i] = gix
     end
-    return groups, gslots, ngroups
+    return ngroups
 end
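The splitter/collector/cleanup trio above parallelizes group detection for huge data sets: rows are bucketed by `hash % 997`, each bucket builds its own small hash table on its own thread, and the per-bucket group ids are then shifted by a cumulative offset so they become globally unique. A self-contained, single-threaded sketch of the offsetting step (data invented):

```julia
# per-bucket local group ids, computed independently per thread
bucket_groups = [[1, 2, 1], [1, 1, 2]]
ngroups = [2, 2]                 # number of groups found in each bucket
offsets = cumsum([0; ngroups])   # -> [0, 2, 4]

# shift each bucket's ids by the group count of all earlier buckets
global_ids = [g .+ offsets[b] for (b, g) in enumerate(bucket_groups)]
# -> [[1, 2, 1], [3, 3, 4]]; the total group count is offsets[end] == 4
```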
diff --git a/src/precompile/precompile.jl b/src/precompile/precompile.jl
index e1f03ce5..51f62c76 100644
--- a/src/precompile/precompile.jl
+++ b/src/precompile/precompile.jl
@@ -546,6 +546,6 @@ function _precompile()
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:id, :threads), Tuple{Symbol, Bool}},typeof(transpose),Dataset,Vector{Symbol}})
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:threads,), Tuple{Bool}},typeof(transpose),Dataset,UnitRange{Int64}})
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:threads,), Tuple{Bool}},typeof(transpose),GroupBy,Vector{Int64}})
-
+    VERSION >= v"1.9" && IMD.warmup()
     return nothing
 end
\ No newline at end of file
diff --git a/src/precompile/warmup.jl b/src/precompile/warmup.jl
index 10982d68..99f8e1a6 100644
--- a/src/precompile/warmup.jl
+++ b/src/precompile/warmup.jl
@@ -173,7 +173,8 @@ function warmup()
     findall(duplicates(ds, :a, mapformats = true)) == 2:12
     unique(ds) == ds1
     unique(ds, 2:3) == ds1
-
+    ds = Dataset(x=[rand(10) for _ in 1:100])
+    flatten!(ds, 1)
     t2 = now()
     Dataset(x1 = "Finished warmup in", x2 = t2-t1)
 end
diff --git a/src/sort/gatherby.jl b/src/sort/gatherby.jl
index 46259219..c914b740 100644
--- a/src/sort/gatherby.jl
+++ b/src/sort/gatherby.jl
@@ -115,8 +115,6 @@ function compute_indices(groups, ngroups, ::Val{T}; threads = true) where T
     idx, starts
 end
 
-# fast combine for gatherby data
-
 mutable struct GatherBy
     parent
     groupcols
@@ -125,8 +123,12 @@ mutable struct GatherBy
     mapformats::Bool
     perm
     starts
+    created::DateTime
+end
+function Base.copy(gds::GatherBy)
+    ds_cpy = copy(gds.parent)
+    GatherBy(ds_cpy, copy(gds.groupcols), copy(gds.groups), gds.lastvalid, gds.mapformats, gds.perm === nothing ? nothing : copy(gds.perm), gds.starts === nothing ? nothing : copy(gds.starts), _get_lastmodified(_attributes(ds_cpy)))
 end
-Base.copy(gds::GatherBy) = GatherBy(copy(gds.parent), copy(gds.groupcols), copy(gds.groups), gds.lastvalid, gds.mapformats, gds.perm === nothing ? nothing : copy(gds.perm), gds.starts === nothing ? nothing : copy(gds.starts))
 
 nrow(ds::GatherBy) = nrow(ds.parent)
@@ -148,6 +150,7 @@ Base.summary(gds::GatherBy) =
 
 function Base.show(io::IO, gds::GatherBy;
                    kwargs...)
+    _check_consistency(gds)
     if length(_get_perms(gds)) > 200
         _show(io, view(gds.parent, [first(gds.perm, 100);last(gds.perm, 100)], :); title = summary(gds), show_omitted_cell_summary=false, show_row_number = false, kwargs...)
     else
@@ -176,57 +179,43 @@ end
 function gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats::Bool = true, stable::Bool = true, isgathered::Bool = false, eachrow::Bool = false, threads = true)
     colsidx = index(ds)[cols]
     if isempty(ds)
-        return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing)
+        return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
     end
     T = nrow(ds) < typemax(Int32) ? Int32 : Int64
     _check_consistency(ds)
     if isgathered
         if eachrow
-            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds))
+            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds), _get_lastmodified(_attributes(ds)))
         else
             colindex, ranges, last_valid_index = _find_starts_of_groups(ds, colsidx, Val(T); mapformats = mapformats, threads = threads)
             groups = Vector{T}(undef, nrow(ds))
             _group_creator!(groups, ranges, last_valid_index)
-            return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges)
+            return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges, _get_lastmodified(_attributes(ds)))
         end
     else
         if eachrow
             a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
             b = compute_indices(a[1], a[3], nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64); threads = threads)
-            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds))
+            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds), _get_lastmodified(_attributes(ds)))
         else
             a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
-            return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing)
+            return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
         end
     end
 end
 gatherby(ds::AbstractDataset, col::ColumnIndex; mapformats = true, stable = true, isgathered = false, eachrow = false, threads = true) = gatherby(ds, [col], mapformats = mapformats, stable = stable, isgathered = isgathered, eachrow = eachrow, threads = threads)
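The new `created::DateTime` field plus the `_check_consistency` call in `show` implement the 0.7.16 changelog promise that a stale `GatherBy` errors instead of silently displaying wrong groups. Roughly the intended behavior (a sketch; the exact mutating call and error text are assumptions, not taken from this diff):

```julia
ds = Dataset(x = [1, 1, 2], y = [10, 20, 30])
g  = gatherby(ds, :x)

ds[1, :y] = 99    # modifying the parent bumps its last-modified stamp

show(stdout, g)   # should now throw: the `created` stamp of `g` no longer
                  # matches the parent's last-modified time
```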
Val(Int32) : Val(Int64); mapformats = mapformats, threads = threads) - select!(ds, Not([:___tmp___cols8934, :___tmp___cols8934_2])) - GatherBy(ds, grpcols, nothing, last_valid_index, mapformats, gds.perm, ranges) -end - function _fill_mapreduce_col!(x, f, op, y, loc) @inbounds for i in 1:length(y) x[loc[i]] = op(x[loc[i]], f(y[i])) end end -function _fill_mapreduce_col!(x, f::Vector, op, y, loc) +# only for calculating var - mval is a vector of means +function _fill_mapreduce_col!(x, mval::AbstractVector, op, y, loc) @inbounds for i in 1:length(y) - x[loc[i]] = op(x[loc[i]], f[loc[i]](y[i])) + x[loc[i]] = op(x[loc[i]], _abs2mean(y[i], mval[loc[i]])) end end @@ -242,11 +231,12 @@ function _fill_mapreduce_col_threaded!(x, f, op, y, loc, nt) end end -function _fill_mapreduce_col_threaded!(x, f::Vector, op, y, loc, nt) +# only for calculating var - mval is a vector of means +function _fill_mapreduce_col_threaded!(x, mval::AbstractVector, op, y, loc, nt) @sync for thid in 0:nt-1 Threads.@spawn for i in 1:length(y) @inbounds if loc[i] % nt == thid - x[loc[i]] = op(x[loc[i]], f[loc[i]](y[i])) + x[loc[i]] = op(x[loc[i]], _abs2mean(y[i], mval[loc[i]])) end end end @@ -258,8 +248,8 @@ end function gatherby_mapreduce(gds::GatherBy, f, op, col::ColumnIndex, nt, init, ::Val{T}; promotetypes = false, threads = true) where T CT = T if promotetypes - T <: Base.SmallSigned ? CT = Int : nothing - T <: Base.SmallUnsigned ? CT = UInt : nothing + T <: SMALLSIGNED ? CT = Int : nothing + T <: SMALLUNSIGNED ? CT = UInt : nothing end res = allocatecol(Union{CT, Missing}, gds.lastvalid) fill!(res, init) @@ -271,8 +261,8 @@ function gatherby_mapreduce(gds::GatherBy, f, op, col::ColumnIndex, nt, init, :: res end -_gatherby_maximum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_max_fun, col, nt, missing, Val(nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) -_gatherby_minimum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_min_fun, col, nt, missing, Val(nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) +_gatherby_maximum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_max_fun, col, nt, missing, Val(our_nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) +_gatherby_minimum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_min_fun, col, nt, missing, Val(our_nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) _gatherby_sum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_add_sum, col, nt, missing, Val(typeof(zero(Core.Compiler.return_type(f, Tuple{eltype(gds.parent[!, col])})))), promotetypes = true, threads = threads) _gatherby_n(gds, col; nt = Threads.nthreads(), threads = true) = _gatherby_sum(gds, col, f = _stat_notmissing, nt = nt, threads = threads) _gatherby_length(gds, col; nt = Threads.nthreads(), threads = true) = _gatherby_sum(gds, col, f = x->1, nt = nt, threads = threads) @@ -306,7 +296,7 @@ function _gatherby_mean(gds, col; nt = Threads.nthreads(), threads = true) nval = t2 end - T = Core.Compiler.return_type(/, Tuple{nonmissingtype(eltype(sval)), nonmissingtype(eltype(nval))}) + T = Core.Compiler.return_type(/, Tuple{our_nonmissingtype(eltype(sval)), our_nonmissingtype(eltype(nval))}) res = _our_vect_alloc(Union{Missing, T}, length(nval)) _fill_gatherby_mean_barrier!(res, sval, nval) res @@ -340,6 
+330,7 @@ function _fill_gatherby_var_barrier!(res, countnan, meanval, ss, nval, cal_std, end # TODO directly calculating var should be a better approach +_abs2mean(x, meanval) = abs2(x - meanval) function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) if threads nt = Threads.nthreads() @@ -347,7 +338,7 @@ function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) t1 = Threads.@spawn _gatherby_cntnan(gds, col, nt = nt2) t2 = Threads.@spawn _gatherby_mean(gds, col, nt = nt2) meanval = fetch(t2) - t3 = Threads.@spawn gatherby_mapreduce(gds, [x->abs2(x - meanval[i]) for i in 1:length(meanval)], _stat_add_sum, col, nt2, missing, Val(Float64)) + t3 = Threads.@spawn gatherby_mapreduce(gds, meanval, _stat_add_sum, col, nt2, missing, Val(Float64)) t4 = Threads.@spawn _gatherby_n(gds, col, nt = nt2) countnan = fetch(t1) ss = fetch(t3) @@ -356,13 +347,13 @@ function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) t1 = _gatherby_cntnan(gds, col, threads = threads) t2 = _gatherby_mean(gds, col, threads = threads) meanval = t2 - t3 = gatherby_mapreduce(gds, [x->abs2(x - meanval[i]) for i in 1:length(meanval)], _stat_add_sum, col, Threads.nthreads(), missing, Val(Float64), threads = threads) + t3 = gatherby_mapreduce(gds, meanval, _stat_add_sum, col, Threads.nthreads(), missing, Val(Float64), threads = threads) t4 = _gatherby_n(gds, col, threads = threads) countnan = t1 ss = t3 nval = t4 end - T = Core.Compiler.return_type(/, Tuple{nonmissingtype(eltype(meanval)), nonmissingtype(eltype(nval))}) + T = Core.Compiler.return_type(/, Tuple{our_nonmissingtype(eltype(meanval)), our_nonmissingtype(eltype(nval))}) res = _our_vect_alloc(Union{Missing, T}, length(nval)) _fill_gatherby_var_barrier!(res, countnan, meanval, ss, nval, cal_std, dof) res end @@ -375,7 +366,7 @@ const FAST_GATHERBY_REDUCTION = [sum, length, minimum, maximum, mean, var, std, function _fast_gatherby_reduction(gds, ms) !(gds isa GatherBy) && return false - gds.groups == nothing && return false + gds.groups === nothing && return false for i in 1:length(ms) if (ms[i].second.first isa Expr) && ms[i].second.first.head == :BYROW elseif (ms[i].second.first isa Base.Callable) diff --git a/src/sort/groupby.jl b/src/sort/groupby.jl index 1d80dc59..5ee85abb 100644 --- a/src/sort/groupby.jl +++ b/src/sort/groupby.jl @@ -12,7 +12,7 @@ Return a `GroupBy` representing a view of a `sorted` data set which each group o - `ds` : an `AbstractDataset` or the output of `groupby`. - `cols` : data set columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). -- `alg` : The sorting algorithm for creating `grouped` data set. By default Heap algorithm is used, however, user can pass `Quicksort` too. +- `alg` : The sorting algorithm for creating the `grouped` data set. By default the Heap algorithm is used; however, the user can pass `QuickSort` too. - `rev` : A `Bool` value or a Vector of `Bool` which indicates which columns should be sorted in descending order. - `mapformats`: Whether the formatted values should be used or not. - `stable`: Whether the sorting algorithm should be stable or not. Setting this to `false` often improves the performance. @@ -97,7 +97,7 @@ Replace a data set by its sorted version and tag the data set as a grouped data s - `ds` : a `Dataset`. - `cols` : data set columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). -- `alg` : The sorting algorithm for creating `grouped` data set. 
By default Heap algorithm is used, however, user can pass `Quicksort` too. +- `alg` : The sorting algorithm for creating the `grouped` data set. By default the Heap algorithm is used; however, the user can pass `QuickSort` too. - `rev` : A `Bool` value or a Vector of `Bool` which indicates which columns should be sorted in descending order. - `mapformats`: Whether the formatted values should be used or not. - `stable`: Whether the sorting algorithm should be stable or not. Setting this to `false` often improves the performance. @@ -167,9 +167,13 @@ mutable struct GroupBy parent groupcols rev perm starts lastvalid mapformats::Bool + created::DateTime end -Base.copy(gds::GroupBy) = GroupBy(copy(gds.parent), copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats) +function Base.copy(gds::GroupBy) + ds_cp = copy(gds.parent) + GroupBy(ds_cp, copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats, _get_lastmodified(_attributes(ds_cp))) +end nrow(ds::GroupBy) = nrow(ds.parent) ncol(ds::GroupBy) = ncol(ds.parent) @@ -185,10 +189,10 @@ function groupby(ds::Dataset, cols::MultiColumnIndex; alg = HeapSortAlg(), rev = _check_consistency(ds) colsidx = index(ds)[cols] if isempty(ds) - return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats) + return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats, _get_lastmodified(_attributes(ds))) end a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads) - GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds))) end groupby(ds::Dataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) @@ -209,7 +213,7 @@ function groupby(ds::GroupBy, cols::MultiColumnIndex; alg = HeapSortAlg(), rev = colsidx = index(ds)[cols] grng = GIVENRANGE(copy(_get_perms(ds)),copy(_group_starts(ds)), nothing, _ngroups(ds)) a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, givenrange = grng, skipcol = -1, threads = threads) - GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(parent(ds)))) end groupby(ds::GroupBy, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) @@ -233,6 +237,7 @@ end modify(original_gds::Union{GroupBy, GatherBy}, @nospecialize(args...); threads::Bool = true) = modify!(copy(original_gds), args..., threads = threads) function modify!(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); threads::Bool = true) + _check_consistency(gds) if parent(gds) isa SubDataset idx_cpy = copy(index(parent(gds))) else @@ -315,6 +320,7 @@ end function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgroupcols = false, threads = true) + _check_consistency(gds) idx_cpy::Index = Index(Dict{Symbol, Int}(), Symbol[], Dict{Int, Function}()) if !dropgroupcols for i in gds.groupcols @@ -441,6 +447,7 @@ Base.summary(gds::GroupBy) = function Base.show(io::IO, gds::GroupBy; kwargs...) 
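# ----------------------------------------------------------------------------
# Editor's sketch (not part of this patch): the intended behaviour of the new
# `created` timestamp, assuming -- as these hunks do -- that mutating a Dataset
# updates the lastmodified attribute read by `_get_lastmodified`.
# Hypothetical session:
#   ds  = Dataset(x = [1, 2, 1], y = [10, 20, 30])
#   gds = groupby(ds, :x)     # gds.created snapshots the parent's lastmodified
#   ds[1, :y] = 99            # parent modified after grouping
#   combine(gds, :y => sum)   # fails the @assert in _check_consistency
#   gds = groupby(ds, :x)     # regrouping refreshes the snapshot
# ----------------------------------------------------------------------------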
+ _check_consistency(gds) #TODO pretty_table is very slow for large views, temporary workaround, later we should fix this if length(gds.perm) > 200 _show(io, view(gds.parent, [first(gds.perm, 100);last(gds.perm, 100)], :); title = summary(gds), show_omitted_cell_summary=false, show_row_number = false, kwargs...) @@ -589,6 +596,15 @@ function groupby(ds::SubDataset, cols::MultiColumnIndex; alg = HeapSortAlg(), re _check_consistency(ds) colsidx = index(ds)[cols] a = _sortperm_v(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads) - GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds))) end groupby(ds::SubDataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) + + +### check consistency of grouped data - GroupBy, GatherBy + +function _check_consistency(ds::Union{GroupBy, GatherBy}) + lmd = ds.created + lmp = _get_lastmodified(_attributes(parent(ds))) + @assert lmd == lmp "The parent data set that this grouped data set is based on has been modified. Regroup the data to fix the issue." +end \ No newline at end of file diff --git a/src/sort/int.jl b/src/sort/int.jl index d8617f76..681ca1b6 100644 --- a/src/sort/int.jl +++ b/src/sort/int.jl @@ -98,7 +98,7 @@ end function _sort_chunks_int_right!(x, idx::Vector{<:Integer}, idx_cpy, where, number_of_chunks, rangelen, minval, o::Ordering) cz = div(length(x), number_of_chunks) en = length(x) - Threads.@threads for i in 1:number_of_chunks + Threads.@threads :static for i in 1:number_of_chunks ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], (i-1)*cz+1,i*cz, rangelen, minval) end # take care of the last few observations @@ -111,7 +111,7 @@ end function _sort_chunks_int_left!(x, idx::Vector{<:Integer}, idx_cpy, where, number_of_chunks, rangelen, minval, o::Ordering) cz = div(length(x), number_of_chunks) en = length(x) - Threads.@threads for i in 1:number_of_chunks + Threads.@threads :static for i in 1:number_of_chunks ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], (i-1)*cz+1,i*cz, rangelen, minval) end # take care of the last few observations @@ -262,7 +262,7 @@ function _ds_sort_int_missatright_nopermx_threaded!(x, original_P, copy_P, lo, h where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = lo:hi + Threads.@threads :static for i = lo:hi @inbounds ismissing(x[i]) ? where[Threads.threadid()][rangelen+3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 2] += 1 end for j in 3:length(where[1]) @@ -306,7 +306,7 @@ function _ds_sort_int_missatright_nopermx_threaded!(x, original_P, rangelen, min where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = 1:length(x) + Threads.@threads :static for i = 1:length(x) @inbounds ismissing(x[i]) ? where[Threads.threadid()][rangelen+3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 2] += 1 end for j in 3:length(where[1]) @@ -348,7 +348,7 @@ function _ds_sort_int_missatleft_nopermx_threaded!(x, original_P, copy_P, lo, hi where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = lo:hi + Threads.@threads :static for i = lo:hi @inbounds ismissing(x[i]) ? 
where[Threads.threadid()][3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 3] += 1 end for j in 3:length(where[1]) @@ -392,7 +392,7 @@ function _ds_sort_int_missatleft_nopermx_threaded!(x, original_P, rangelen, minv where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = 1:length(x) + Threads.@threads :static for i = 1:length(x) @inbounds ismissing(x[i]) ? where[Threads.threadid()][3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 3] += 1 end for j in 3:length(where[1]) diff --git a/src/sort/sort.jl b/src/sort/sort.jl index 82f14bd3..7e67766c 100644 --- a/src/sort/sort.jl +++ b/src/sort/sort.jl @@ -213,11 +213,21 @@ end function _issorted_check_for_each_range(v, starts, lastvalid, _ord, nrows; threads = true) part_res = ones(Bool, threads ? Threads.nthreads() : 1) - @_threadsfor threads for rng in 1:lastvalid - lo = starts[rng] - rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 - part_res[Threads.threadid()] = _issorted_barrier(v, _ord, lo, hi) - !part_res[Threads.threadid()] && break + if threads + + Threads.@threads :static for rng in 1:lastvalid + lo = starts[rng] + rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 + part_res[Threads.threadid()] = _issorted_barrier(v, _ord, lo, hi) + !part_res[Threads.threadid()] && break + end + else + for rng in 1:lastvalid + lo = starts[rng] + rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 + part_res[1] = _issorted_barrier(v, _ord, lo, hi) # part_res has length 1 in the sequential path + !part_res[1] && break + end end all(part_res) end diff --git a/src/sort/sortperm.jl b/src/sort/sortperm.jl index 9fc77631..cd6e2d84 100644 --- a/src/sort/sortperm.jl +++ b/src/sort/sortperm.jl @@ -29,7 +29,7 @@ end # we should find starts here function fast_sortperm_int_threaded!(x, original_P, copy_P, ranges, rangelen, minval, misatleft, last_valid_range, ::Val{T}) where T starts = [T[] for i in 1:Threads.nthreads()] - Threads.@threads for i in 1:last_valid_range + Threads.@threads :static for i in 1:last_valid_range rangestart = ranges[i] i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 # if (rangeend - rangestart) == 0 @@ -45,6 +45,8 @@ function fast_sortperm_int_threaded!(x, original_P, copy_P, ranges, rangelen, mi end cnt = 1 flag = false + # Threads.@threads no longer keeps the order of the runs; sort `starts` so the ranges are shaped in order + sort!(starts, by=x->isempty(x) ? missing : x[1]) @inbounds for i in 1:length(starts) for j in 1:length(starts[i]) ranges[cnt] = starts[i][j] @@ -103,29 +105,57 @@ function fast_sortperm_int!(x, original_P, copy_P, ranges, rangelen, minval, mis end function _sortperm_int!(idx, idx_cpy, x, ranges, where, last_valid_range, missingatleft, ord, a; threads = true) - @_threadsfor threads for i in 1:last_valid_range - rangestart = ranges[i] - i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 - if (rangeend - rangestart + 1) == 1 - continue - end - _minval = stat_minimum(x, lo = rangestart, hi = rangeend) - if ismissing(_minval) - continue - else - minval::Int = _minval + if threads + Threads.@threads :static for i in 1:last_valid_range + rangestart = ranges[i] + i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 + if (rangeend - rangestart + 1) == 1 + continue + end + _minval = stat_minimum(x, lo = rangestart, hi = rangeend) + if ismissing(_minval) + continue + else + minval::Int = _minval + end + maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) + # the overflow is checked before calling _sortperm_int! 
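# ----------------------------------------------------------------------------
# Editor's note (illustrative, not part of this patch): the switch to
# `Threads.@threads :static` in these hunks matters because the loop bodies
# index per-thread scratch buffers via `Threads.threadid()`. Since Julia 1.8
# the default schedule is dynamic and a task may migrate between threads, so
# `threadid()` is only stable under `:static`. A minimal, self-contained demo
# of the pattern being protected:
using Base.Threads
function per_thread_histogram(x, k)
    buf = [zeros(Int, k) for _ in 1:nthreads()]  # one scratch buffer per thread
    @threads :static for i in eachindex(x)
        buf[threadid()][x[i]] += 1  # safe: each iteration is pinned to one thread
    end
    reduce(+, buf)
end
per_thread_histogram(rand(1:8, 10_000), 8)
# ----------------------------------------------------------------------------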
+ rangelen = maxval - minval + 1 + if rangelen < div(rangeend - rangestart + 1, 2) + if missingatleft + ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + end + else + ds_sort!(x, idx, rangestart, rangeend, a, ord) + end end - maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) - # the overflow is check before calling _sortperm_int! - rangelen = maxval - minval + 1 - if rangelen < div(rangeend - rangestart + 1, 2) - if missingatleft - ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + for i in 1:last_valid_range + rangestart = ranges[i] + i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 + if (rangeend - rangestart + 1) == 1 + continue + end + _minval = stat_minimum(x, lo = rangestart, hi = rangeend) + if ismissing(_minval) + continue else - ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + minval::Int = _minval + end + maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) + # the overflow is checked before calling _sortperm_int! + rangelen = maxval - minval + 1 + if rangelen < div(rangeend - rangestart + 1, 2) + if missingatleft + ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + end + else + ds_sort!(x, idx, rangestart, rangeend, a, ord) end - else - ds_sort!(x, idx, rangestart, rangeend, a, ord) end end end @@ -133,7 +163,7 @@ end function _apply_by_f_barrier(x::AbstractVector{T}, by, rev, threads) where T needrev = rev missat = :right - CT = Core.Compiler.return_type(_date_value∘by, Tuple{nonmissingtype(T)}) + CT = Core.Compiler.return_type(_date_value∘by, Tuple{our_nonmissingtype(T)}) if CT == Bool CT = Int8 end @@ -141,7 +171,7 @@ function _apply_by_f_barrier(x::AbstractVector{T}, by, rev, threads) where T # _temp = Vector{CT}(undef, length(x)) _temp = _our_vect_alloc(CT, length(x)) # we should make sure changing sign doesn't overflow - if rev && nonmissingtype(CT) <: Union{Bool, Int8, Int16, Int32, Int64} && isless(typemin(nonmissingtype(CT)), threads ? hp_minimum(_date_value∘by, x) : stat_minimum(_date_value∘by, x)) + if rev && our_nonmissingtype(CT) <: Union{Bool, Int8, Int16, Int32, Int64} && isless(typemin(our_nonmissingtype(CT)), threads ? 
hp_minimum(_date_value∘by, x) : stat_minimum(_date_value∘by, x)) _by = x-> -_date_value(by(x)) needrev = false missat = :left diff --git a/src/stat/hp_stat.jl b/src/stat/hp_stat.jl index 5a5766da..ed5d6aa3 100644 --- a/src/stat/hp_stat.jl +++ b/src/stat/hp_stat.jl @@ -3,7 +3,7 @@ function hp_maximum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_maximum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) if T >: Missing CT = Union{Missing,CT} end @@ -22,7 +22,7 @@ function hp_minimum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_minimum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) if T >: Missing CT = Union{Missing,CT} end @@ -41,9 +41,9 @@ function hp_sum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_sum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) - CT <: Base.SmallSigned ? CT = Int : nothing - CT <: Base.SmallUnsigned ? CT = UInt : nothing + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) + CT <: SMALLSIGNED ? CT = Int : nothing + CT <: SMALLUNSIGNED ? CT = UInt : nothing CT <: Bool ? CT = Int : nothing if T >: Missing CT = Union{Missing,CT} diff --git a/src/stat/non_hp_stat.jl b/src/stat/non_hp_stat.jl index 8ad5c8f9..d44e3d71 100644 --- a/src/stat/non_hp_stat.jl +++ b/src/stat/non_hp_stat.jl @@ -119,7 +119,7 @@ function rescale(x, minx, maxx, minval, maxval) -(-maxx * minval + minx * maxval) / (maxx - minx) + (-minval + maxval) * x / (maxx - minx) end rescale(::Missing, minx, maxx, minval, maxval) = missing -rescale(x::Vector, minx, maxx, minval, maxval) = rescale.(x, minx, maxx, minval, maxval) +rescale(x::AbstractVector, minx, maxx, minval, maxval) = rescale.(x, minx, maxx, minval, maxval) rescale(x, minx, maxx) = rescale(x, minx, maxx, 0.0, 1.0) """ @@ -137,11 +137,10 @@ end # this is manual simd version for max(min) function function stat_maximum(f::typeof(identity), x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} all(ismissing, view(x, lo:hi)) && return missing - _dmiss(x) = ismissing(x) ? typemin(nonmissingtype(T)) : x + _dmiss(x) = ismissing(x) ? typemin(our_nonmissingtype(T)) : x Base.mapreduce_impl(_dmiss, max, x, lo, hi) end function stat_maximum(f::F, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {F,T} - all(ismissing, view(x, lo:hi)) && return missing Base.mapreduce_impl(f, _stat_max_fun, x, lo, hi) end stat_maximum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} = stat_maximum(identity, x; lo=lo, hi=hi) @@ -162,11 +161,10 @@ stat_findmax(x::AbstractArray{T,1}) where {T} = stat_findmax(identity, x) function stat_minimum(f::typeof(identity), x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} all(ismissing, view(x, lo:hi)) && return missing - @inline _dmiss(x) = ismissing(x) ? typemax(nonmissingtype(T)) : x + @inline _dmiss(x) = ismissing(x) ? 
typemax(our_nonmissingtype(T)) : x Base.mapreduce_impl(_dmiss, min, x, lo, hi) end function stat_minimum(f::F, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {F,T} - all(ismissing, view(x, lo:hi)) && return missing Base.mapreduce_impl(f, _stat_min_fun, x, lo, hi) end stat_minimum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} = stat_minimum(identity, x; lo=lo, hi=hi) @@ -180,9 +178,7 @@ stat_findmin(x::AbstractArray{T,1}) where {T} = stat_findmin(identity, x) function stat_sum(f, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T<:Union{Missing,INTEGERS,FLOATS}} - all(ismissing, view(x, lo:hi)) && return f(first(x)) - _dmiss(y) = ifelse(ismissing(f(y)), zero(T), f(y)) - Base.mapreduce_impl(_dmiss, _stat_add_sum, x, lo, hi) + Base.mapreduce_impl(f, _stat_add_sum, x, lo, hi) end stat_sum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T<:Union{Missing,INTEGERS,FLOATS}} = stat_sum(identity, x; lo=lo, hi=hi) @@ -297,19 +293,21 @@ function stat_wmean(f, x::AbstractVector{T}, w::AbstractArray{S,1}) where {T} wh end stat_wmean(x::AbstractVector{T}, w::AbstractArray{S,1}) where {T} where {S} = stat_wmean(identity, x, w) - +_abs2_var_barrier(x,y,f::F) where F = abs2(f(x)-y) +_meanval_var_barrier(n, sval)::Union{Missing, Float64} = n == 0 ? missing : sval / n function stat_var(f, x::AbstractArray{T,1}, dof=true)::Union{Float64,Missing} where {T<:Union{Missing,INTEGERS,FLOATS}} - all(ismissing, x) && return missing + # all(ismissing, x) && return missing # any(ISNAN, x) && return convert(eltype(x), NaN) # meanval = stat_mean(f, x) # n = mapreduce(!ismissing∘f, +, x) sval = stat_sum(y -> f(y) * 1.0, x) n = mapreduce(!ismissing ∘ f, +, x) - meanval = n == 0 ? missing : sval / n + meanval = _meanval_var_barrier(n, sval) ss = 0.0 for i in 1:length(x) - ss = _stat_add_sum(ss, abs2(f(x[i]) - meanval)) + # ss = _stat_add_sum(ss, abs2(f(x[i]) - meanval)) + ss = _stat_add_sum(ss, _abs2_var_barrier(x[i], meanval, f)) end if n == 0 @@ -331,7 +329,7 @@ stat_std(x::AbstractArray{T,1}, dof=true) where {T} = stat_std(identity, x, dof) function stat_median(v::AbstractArray{T,1}) where {T} isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) all(ismissing, v) && return missing - (nonmissingtype(eltype(v)) <: AbstractFloat || nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) + (our_nonmissingtype(eltype(v)) <: AbstractFloat || our_nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) nmis::Int = mapreduce(ismissing, +, v) n = length(v) - nmis mid = div(1 + n, 2) @@ -343,10 +341,11 @@ function stat_median(v::AbstractArray{T,1}) where {T} end end +# TODO in julia1.9+ partialsort! allocates, and it is not a good idea if we need to call stat_median! 
many times function stat_median!(v::AbstractArray{T,1}) where {T} isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) all(ismissing, v) && return missing - (nonmissingtype(eltype(v)) <: AbstractFloat || nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) + (our_nonmissingtype(eltype(v)) <: AbstractFloat || our_nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) nmis::Int = mapreduce(ismissing, +, v) n = length(v) - nmis mid = div(1 + n, 2) diff --git a/src/stat/stat.jl b/src/stat/stat.jl index fe7d64be..d3d5ecbf 100644 --- a/src/stat/stat.jl +++ b/src/stat/stat.jl @@ -8,10 +8,16 @@ minimum(f, x; threads = false) = Base.minimum(f, x) minimum(x::AbstractArray{Union{Missing, T},1}; threads = false) where T <: Union{INTEGERS, FLOATS, TimeType}= isempty(x) ? throw(ArgumentError("empty arrays are not allowed")) : threads ? hp_minimum(identity, x) : stat_minimum(identity, x) minimum(x; threads = false) = Base.minimum(x) # TODO not optimised for simd - threads option is useless here / it is here because we have it for other types of data -maximum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = mapreduce(f, _stat_max_fun, x) -minimum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = mapreduce(f, _stat_min_fun, x) -maximum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = maximum(identity, x) -minimum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = minimum(identity, x) +# using Union{Missing, AbstractString} forces Vector{Missing} to fall back to these definitions on Julia >= 1.9 +if VERSION >= v"1.9" + _TASM_14329 = Union{Missing, AbstractString} +else + _TASM_14329 = AbstractString +end +maximum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = mapreduce(f, _stat_max_fun, x) +minimum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = mapreduce(f, _stat_min_fun, x) +maximum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = maximum(identity, x) +minimum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = minimum(identity, x) sum(f, x::AbstractArray{Union{Missing, T},1}; threads = false) where T <: Union{INTEGERS, FLOATS} = isempty(x) ? throw(ArgumentError("empty arrays are not allowed")) : threads ? hp_sum(f, x) : stat_sum(f, x) sum(f, x; threads = false)=Base.sum(f, x) diff --git a/src/subdataset/subdataset.jl b/src/subdataset/subdataset.jl index cc1f5091..19acb161 100644 --- a/src/subdataset/subdataset.jl +++ b/src/subdataset/subdataset.jl @@ -1,5 +1,5 @@ """ - SubDataset{<:Dataset, <:AbstractIndex, <:AbstractVector{Int}} <: Dataset + SubDataset{<:Dataset, <:AbstractIndex, <:AbstractVector{Int}, DateTime} <: Dataset A view of a `Dataset`. It is returned by a call to the `view` function on a `Dataset` if a collection of rows and columns is specified. 
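A minimal dispatch sketch (editor's illustration with a hypothetical function `f`, not part of this patch) of the `_TASM_14329` trick above: with the bound widened to `Union{Missing, AbstractString}`, `T = Missing` satisfies the `where` constraint, so a `Vector{Missing}` can reach the string methods; pre-1.9 dispatch behaved differently, which is what the `VERSION` gate works around.

f(x::AbstractVector{Union{Missing, T}}) where T <: Union{Missing, AbstractString} = :string_path
f(x::AbstractVector) = :generic_path

f([missing, "a"])               # => :string_path (T = String)
f(Vector{Missing}(missing, 2))  # => :string_path (T = Missing satisfies the widened bound)
f([1, 2])                       # => :generic_path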
@@ -90,6 +90,7 @@ struct SubDataset{D<:AbstractDataset, S<:AbstractIndex, T<:AbstractVector{Int}} parent::D colindex::S rows::T # maps from subds row indexes to parent row indexes + created::DateTime end _attributes(sds::SubDataset) = getfield(parent(sds), :attributes) @@ -111,11 +112,25 @@ Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVect sindex = SubIndex(index(parent), cols) # SubDataset without columns should not have any row if all(==(0), sindex.remap) - SubDataset(parent, sindex, Int[]) + SubDataset(parent, sindex, Int[], _get_lastmodified(_attributes(parent))) else - SubDataset(parent,sindex , rows) + SubDataset(parent,sindex , rows, _get_lastmodified(_attributes(parent))) end end + +Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVector{Int}, cols, created) + @boundscheck if !checkindex(Bool, axes(parent, 1), rows) + throw(BoundsError(parent, (rows, cols))) + end + sindex = SubIndex(index(parent), cols) + # SubDataset without columns should not have any row + if all(==(0), sindex.remap) + SubDataset(parent, sindex, Int[], created) + else + SubDataset(parent,sindex , rows, created) + end +end + Base.@propagate_inbounds SubDataset(parent::Dataset, ::Colon, cols) = SubDataset(parent, axes(parent, 1), cols) @inline SubDataset(parent::Dataset, row::Integer, cols) = @@ -144,7 +159,7 @@ Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVect end Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind, cols) = - SubDataset(parent(sds), rows(sds)[rowind], parentcols(index(sds), cols)) + SubDataset(parent(sds), rows(sds)[rowind], parentcols(index(sds), cols), getfield(sds, :created)) Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, cols) = throw(ArgumentError("invalid row index of type Bool")) @@ -158,7 +173,7 @@ Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, cols) = Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, ::Colon) = throw(ArgumentError("invalid row index of type Bool")) Base.@propagate_inbounds SubDataset(sds::SubDataset, ::Colon, cols) = - SubDataset(parent(sds), rows(sds), parentcols(index(sds), cols)) + SubDataset(parent(sds), rows(sds), parentcols(index(sds), cols), getfield(sds, :created)) @inline SubDataset(sds::SubDataset, ::Colon, ::Colon) = sds # just for showing SubDataset @@ -202,15 +217,15 @@ Base.@propagate_inbounds Base.view(ads::AbstractDataset, ::typeof(!), colind::Co @inline Base.view(ads::AbstractDataset, rowinds, colind::Bool) = throw(ArgumentError("invalid column index $colind of type `Bool`")) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds, +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds, colinds::MultiColumnIndex) = - SubDataset(ads, rowinds, colinds) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds::typeof(!), + SubDataset(parent, rowinds, colinds) +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds::typeof(!), colinds::MultiColumnIndex) = - SubDataset(ads, :, colinds) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds::Not, + SubDataset(parent, :, colinds) +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds::Not, colinds::MultiColumnIndex) = - SubDataset(ads, axes(ads, 1)[rowinds], colinds) + SubDataset(parent, axes(parent, 1)[rowinds], colinds) ############################################################################## ## diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 461c725e..b4a63005 100644 
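The two `SubDataset` constructor paths above encode a simple rule: a fresh view snapshots the parent's current lastmodified, while a view of a view reuses the outer view's `created` (the `getfield(sds, :created)` calls), so both go stale together once the parent mutates. A toy model of that rule (editor's illustration; `Parent` and `View` are made-up stand-ins, with an `Int` counter in place of `DateTime`):

mutable struct Parent
    lastmodified::Int
end
struct View
    parent::Parent
    created::Int
end
View(p::Parent) = View(p, p.lastmodified)      # fresh view: snapshot now
subview(v::View) = View(v.parent, v.created)   # view of a view: reuse the snapshot
isstale(v::View) = v.created != v.parent.lastmodified

p = Parent(1); v = View(p); sv = subview(v)
p.lastmodified += 1                            # mutate the parent
isstale(v), isstale(sv)                        # (true, true): both go stale together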
--- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -133,7 +133,7 @@ end end ds4 = (x -> ds[1, 1]).(ds) @test names(ds4) == names(ds) - @test all(isa.(eachcol(ds4), Ref(CategoricalArray))) + @test all(isa.(eachcol(ds4), DatasetColumn{Dataset, CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}})) @test all(eachcol(ds4) .== Ref(categorical(["a", "a"]))) ds5 = Dataset(x=Any[1, 2, 3], y=Any[1, 2.0, big(3)]) diff --git a/test/byrow.jl b/test/byrow.jl index 379a3070..ef4fc19a 100644 --- a/test/byrow.jl +++ b/test/byrow.jl @@ -414,4 +414,12 @@ end @test byrow(ds, fun123, (1,2,3)) == [1,-1.0,-9,2.5] fun123_2(x,y) = x == 1 && y < 0 ? true : false @test byrow(ds, fun123_2, (:x1, :x2)) == [false, false, true, false] +end + +@testset "byrow - nunique" begin + ds = Dataset(x=2.1, y=4611911198408756429, z=missing, k=-2.1) + @test byrow(ds, nunique, :)[1] == 4 + @test byrow(ds, nunique, :, count_missing = false)[1] == 3 + @test byrow(ds, nunique, :, by = abs)[1] == 3 + @test byrow(ds, nunique, :, by = abs, count_missing=false)[1] == 2 end \ No newline at end of file diff --git a/test/constructors.jl b/test/constructors.jl index 231a036c..a5d60f74 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -318,13 +318,13 @@ end @testset "column types" begin ds = Dataset(A = 1:3, B = 2:4, C = 3:5) - answer = [Array{Union{Missing, Int}, 1}, Array{Union{Missing, Int}, 1}, Array{Union{Missing, Int}, 1}] + answer = [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}] @test typeof.(eachcol(ds)) == answer ds[!, :D] = [4, 5, missing] - push!(answer, Vector{Union{Int, Missing}}) + push!(answer, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}) @test typeof.(eachcol(ds)) == answer ds[!, :E] .= 'c' - push!(answer, Vector{Union{Missing, Char}}) + push!(answer, DatasetColumn{Dataset, Vector{Union{Missing, Char}}}) @test typeof.(eachcol(ds)) == answer end diff --git a/test/join.jl b/test/join.jl index 443c3879..f99aff6b 100644 --- a/test/join.jl +++ b/test/join.jl @@ -93,18 +93,25 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, method = :hash) @test innerjoin(name, job, on = :ID) == inner == innerjoin(name, job, on = :ID, threads = false) + @test innerjoin(name, sort(job, :ID), on = :ID) == inner == innerjoin(name, sort(job, :ID), on = :ID, threads = false) == innerjoin(name, sort(job, :ID, rev=true), on = :ID) @test innerjoin(name, job, on = :ID) == inner == innerjoin(name, job, on = :ID, method = :hash, threads = false) @test outerjoin(name, job, on = :ID) == outer == outerjoin(name, job, on = :ID, threads = false) + @test outerjoin(name, sort(job, :ID), on = :ID) == outer == outerjoin(name, sort(job, :ID), on = :ID, threads = false) == outerjoin(name, sort(job, :ID, rev=true), on = :ID) @test outerjoin(name, job, on = :ID) == outer == outerjoin(name, job, on = :ID, method = :hash, threads = false) @test leftjoin(name, job, on = :ID) == left == leftjoin(name, job, on = :ID, threads = false) + @test leftjoin(name, sort(job, :ID), on = :ID) == left == leftjoin(name, sort(job, :ID), on = :ID, threads = false) == leftjoin(name, sort(job, :ID, rev=true), on = :ID) @test leftjoin(name, job, on = :ID) == left == leftjoin(name, job, on = :ID, method = :hash, threads = false) - 
@test semijoin(name, job, on = :ID) == semi == semijoin(name, job, on = :ID, threads = false) + @test semijoin(name, job, on = :ID, method=:sort) == semi == semijoin(name, job, on = :ID, threads = false, method=:sort) + @test semijoin(name, sort(job, :ID), on = :ID, method=:sort) == semi == semijoin(name, sort(job, :ID), on = :ID, threads = false, method=:sort) == semijoin(name, sort(job, :ID, rev=true), on = :ID, method=:sort) @test semijoin(name, job, on = :ID) == semi == semijoin(name, job, on = :ID, method = :hash, threads = false) - @test antijoin(name, job, on = :ID) == anti == antijoin(name, job, on = :ID, threads = false) + @test antijoin(name, job, on = :ID, method=:sort) == anti == antijoin(name, job, on = :ID, threads = false, method=:sort) + @test antijoin(name, sort(job, :ID), on = :ID, method=:sort) == anti == antijoin(name, sort(job, :ID), on = :ID, threads = false, method=:sort) == antijoin(name, sort(job, :ID, rev=true), on = :ID, method=:sort) @test antijoin(name, job, on = :ID) == anti == antijoin(name, job, on = :ID, method = :hash, threads = false) @test closejoin(classA, grades, on = :mark) == closeone == closejoin(classA, grades, on = :mark, threads = false) + @test closejoin(classA, sort(grades, :mark), on = :mark) == closeone == closejoin(classA, sort(grades, :mark), on = :mark, threads = false) == closejoin(classA, sort(grades, :mark, rev=true), on = :mark) @test closejoin(classA, grades, on = :mark) == closeone == closejoin(classA, grades, on = :mark, method = :hash, threads = false) @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, threads = false) + @test closejoin(trades, sort(quotes, :time), on = :time, makeunique = true) == closefinance1 == closejoin(trades, sort(quotes, :time), on = :time, makeunique = true, threads = false) == closejoin(trades, sort(quotes, :time, rev=true), on = :time, makeunique = true) @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, method = :hash, threads = false) @test innerjoin(name, view(job, :, :), on = :ID) == inner @@ -139,13 +146,20 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Millisecond(2)) @test closejoin(trades, quotes, on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Day(2)) @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Millisecond(0)) @test closejoin(trades, quotes, on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact + @test closejoin(trades, sort(quotes, [:ticker, :time]), on = [:ticker, 
:time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms + @test closejoin!(copy(trades), sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms == closejoin!(copy(trades), sort(quotes, [:ticker, :time], rev=true), on =[:ticker, :time], tol = Millisecond(2)) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms @test closejoin!(copy(trades), quotes, on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact + @test closejoin!(copy(trades), sort(quotes, [:ticker, :time]), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact == closejoin!(copy(trades), sort(quotes, [:ticker, :time], rev=true), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) + @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(2), method = :hash) == closfinance_tol2ms @test closejoin(trades, quotes, on =[:ticker, :time], tol = Day(2), method = :hash) == closejoin(trades, quotes, on =[:ticker, :time]) @@ -261,6 +275,13 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test leftjoin(nameid, jobid, on = :ID) == left[:, on] @test semijoin(nameid, jobid, on = :ID) == semi[:, on] @test antijoin(nameid, jobid, on = :ID) == anti[:, on] + + @test innerjoin(nameid, sort(jobid, :ID), on = :ID) == inner[:, on] + @test outerjoin(nameid, sort(jobid, :ID), on = :ID) == outer[:, on] + @test leftjoin(nameid, sort(jobid, :ID), on = :ID) == left[:, on] + @test semijoin(nameid, sort(jobid, :ID), on = :ID) == semi[:, on] + @test antijoin(nameid, sort(jobid, :ID), on = :ID) == anti[:, on] + @test innerjoin(nameid, view(jobid, :, :), on = :ID) == inner[:, on] @test outerjoin(nameid, view(jobid, :, :), on = :ID) == outer[:, on] @test leftjoin(nameid, view(jobid, :, :), on = :ID) == left[:, on] @@ -821,6 +842,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test left2 == leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, threads = false) @test left2 == leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash, threads = false) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, threads = false) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash, threads = false) + @test left1 == left2 @test unique(select!(left1, [:x1, :x2, :x3]), [:x1, :x2]) == unique(dsl, [:x1, :x2]) @@ -845,7 +870,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- semi1 = semijoin(dsl, dsr, on = [:x1, :x2]) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], method = :hash) @test semi1 
== semijoin(dsl, dsr, on = [:x1, :x2], threads = false) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], threads = false) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], method = :hash, threads = false) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], method = :hash, threads = false) + semi2 = semijoin(dsl, dsr, on = [:x1, :x2], accelerate = true) @test semi1 == dsl @@ -866,12 +894,16 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash) @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :hash) + @test left1 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :sort) + @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, threads = false) @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash, threads = false) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, threads = false) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash, threads = false) @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) + @test left1 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) == leftjoin(dsl, sort(dsr, [:x1, :x2], rev=true), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) + @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :hash, threads = false) @test inn1 == out1 == left1 @@ -915,7 +947,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- setformat!(dsl, 1:2=>fmtfun) semi1 = semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false]) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], method =:hash) - @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], threads = false) + @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2, :y2]), on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], method = :hash, threads = false) @@ -924,6 +959,8 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test semi2 == dsl inn1 = innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true) out1 = outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true) + out1 = outerjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], stable = true) + left1 = leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true) @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable 
= true, method = :hash) @@ -995,20 +1032,52 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + dsl = Dataset(x = [1,1,1,2,2,2], y = ([6,4,1,2,5,3])) dsr = Dataset(x = [1,1,2], y = PooledArray([0,3,1]), z=[100,200,300]) @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 
300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
     dsl = Dataset(x = [1,1,1,2,2,2], y = PooledArray([6,4,1,2,5,3]))
     dsr = Dataset(x = [1,1,2], y = ([0,3,1]), z=[100,200,300])
@@ -1046,10 +1115,31 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    pushfirst!(dsr, (1,4,100))
+    @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y], rev=true), on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+
     #views
     for i in 1:100
         l_ridx= rand(1:100, 200)
@@ -1062,6 +1152,14 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
         @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
         @test innerjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
         @test outerjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
+
+
+        @test leftjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test innerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test outerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test leftjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
+        @test innerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
+        @test outerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
     end
     for i in 1:100
@@ -1429,53 +1527,51 @@ end
           s([:id, :fid]) == Dataset([[1, 3], [1, 3]], [:id, :fid])
     @test typeof.(eachcol(s(:id))) ==
           typeof.(eachcol(s(:fid))) ==
-          typeof.(eachcol(s([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(s([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test a(:id) == a(:fid) == a([:id, :fid]) == Dataset([[5], [5]], [:id, :fid])
     @test typeof.(eachcol(a(:id))) ==
           typeof.(eachcol(a(:fid))) ==
-          typeof.(eachcol(a([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(a([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :id
     @test i(on) == Dataset([[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], fid_1 = [1, 3, missing])
     @test typeof.(eachcol(l(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o(on) ≅ Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4])
     @test typeof.(eachcol(o(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :fid
     @test i(on) == Dataset([[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Int}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test l(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], id_1 = [1, 3, missing])
-    @test typeof.(eachcol(l(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}},
-                                      Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(l(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test o(on) ≅ Dataset(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, missing, 0, 2, 4])
-    @test typeof.(eachcol(o(on))) == [Vector{Union{Int, Missing}}, Vector{Union{Missing, Float64}},
-                                      Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(o(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     on = [:id, :fid]
     @test i(on) == Dataset([[1, 3], [1, 3]], [:id, :fid])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l(on) == Dataset(id = [1, 3, 5], fid = [1, 3, 5])
-    @test typeof.(eachcol(l(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(l(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o(on) == Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-    @test typeof.(eachcol(o(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(o(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     i_hash(on) = innerjoin(ds1, ds2, on = on, makeunique=true, method = :hash)
@@ -1489,53 +1585,52 @@ end
           s_hash([:id, :fid]) == Dataset([[1, 3], [1, 3]], [:id, :fid])
     @test typeof.(eachcol(s_hash(:id))) ==
           typeof.(eachcol(s_hash(:fid))) ==
-          typeof.(eachcol(s_hash([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(s_hash([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test a_hash(:id) == a_hash(:fid) == a_hash([:id, :fid]) == Dataset([[5], [5]], [:id, :fid])
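+    # a reminder for the expectations below: `eachcol` on a `Dataset` yields
+    # `DatasetColumn{Dataset, Vector{...}}` wrappers rather than raw `Vector`s,
+    # which is why the expected column types changed throughout this testset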
     @test typeof.(eachcol(a_hash(:id))) ==
           typeof.(eachcol(a_hash(:fid))) ==
-          typeof.(eachcol(a_hash([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(a_hash([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :id
     @test i_hash(on) == Dataset([[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l_hash(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], fid_1 = [1, 3, missing])
     @test typeof.(eachcol(l_hash(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o_hash(on) ≅ Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4])
     @test typeof.(eachcol(o_hash(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :fid
     @test i_hash(on) == Dataset([[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Int}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test l_hash(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], id_1 = [1, 3, missing])
-    @test typeof.(eachcol(l_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}},
-                                           Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(l_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}},
+                                           DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}]
     @test o_hash(on) ≅ Dataset(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, missing, 0, 2, 4])
-    @test typeof.(eachcol(o_hash(on))) == [Vector{Union{Int, Missing}}, Vector{Union{Missing, Float64}},
-                                           Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(o_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     on = [:id, :fid]
     @test i_hash(on) == Dataset([[1, 3], [1, 3]], [:id, :fid])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l_hash(on) == Dataset(id = [1, 3, 5], fid = [1, 3, 5])
-    @test typeof.(eachcol(l_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(l_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o_hash(on) == Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-    @test typeof.(eachcol(o_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(o_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
 #####
 dsl = Dataset(x=[1,2], y=[3,4])
diff --git a/test/stats.jl b/test/stats.jl
index 3427e670..60a8e115 100644
--- a/test/stats.jl
+++ b/test/stats.jl
@@ -163,4 +163,57 @@ end
     @test isequal(IMD.cumprod(x4, missings = :skip), [missing,missing,missing,2])
     @test isequal(IMD.cumprod(x5, missings = :skip), [missing,missing,-9.0,-18.0])
     @test isequal(IMD.cumprod(x6, missings = :skip), [missing,missing, missing, missing])
+end
+@testset "IMD.sum & IMD.mean & IMD.var" begin
+    x = Union{Missing, Int32}[missing, missing, missing, missing]
+    @test isequal(IMD.sum(x), missing)
+    @test IMD.sum(y->ismissing(y) ? 1 : y, x) == 4
+    push!(x, 1)
+    @test IMD.sum(x) == 1
+    @test IMD.sum(y->ismissing(y) ? 1 : y, x) == 5
+
+    @test IMD.mean(x) == 1
+    @test ismissing(IMD.mean(y->isequal(y, 1) ? missing : y, x))
+    @test IMD.mean(y->ismissing(y) ? 1 : y, x) == 1
+
+    @test isequal(IMD.var(x), missing)
+    @test isequal(IMD.var(x, false), 0.0)
+
+    @test isequal(IMD.var(y->ismissing(y) ? 1 : y, x), 0.0)
+    @test isequal(IMD.var(y->ismissing(y) ? 1 : y, x, false), 0.0)
+
+    x = [true, false, true, missing]
+    @test IMD.sum(x) == 2
+    @test IMD.sum(y->isequal(y, true) ? 100 : y, x) == 200
+
+    for i in 1:10
+        x = rand(1:10000, 100)
+        @test IMD.sum(x) == sum(x)
+        x = allowmissing(x)
+        x[50] = missing
+        @test IMD.sum(y->ismissing(y) ? 0 : y, x) == sum(y->ismissing(y) ? 0 : y, x)
+    end
+    if VERSION > v"1.8" # this causes problems in v"1.6"; however, we can ignore it for those versions
+        x = rand(10)
+        n_a = [@allocated IMD.sum(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = Union{Int32, Missing}[1, 2, missing, 4]
+        n_a = [@allocated IMD.sum(x) for _ in 1:10]
+        @test n_a[end] == 0
+
+        n_a = [@allocated IMD.sum(y->ismissing(y) ? 0 : y, x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = rand(10)
+        n_a = [@allocated IMD.mean(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = Union{Int32, Missing}[1, 2, missing, 4]
+        n_a = [@allocated IMD.mean(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        n_a = [@allocated IMD.mean(y->ismissing(y) ? 0 : y, x) for _ in 1:10]
+        @test n_a[end] <= 16
+    end
 end
\ No newline at end of file
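
A minimal sketch of the steady-state allocation check used in the new
`test/stats.jl` testset above (the `IMD` alias and the `<= 16` budget are taken
from those tests; the snippet itself is illustrative and not part of the suite).
Running the call inside a comprehension lets the early iterations absorb any
compilation cost, so only the final iteration reflects steady-state allocations:

using Test, InMemoryDatasets  # `IMD` is the package alias used by the test suite

x = rand(10)
n_a = [@allocated IMD.sum(x) for _ in 1:10]  # warm-up runs come first
@test n_a[end] <= 16                         # assert only on the final, warmed-up run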