diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5e94808..1ea56bf6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,47 @@
+# Version 0.7.18
+
+## Features
+
+* Add `byrow(allequal)` as a special case of `byrow(isequal)`. (This feature needs at least Julia 1.8)
+
+## Fixes
+
+* Fix bugs in some corner cases of the stat routines
+* Fix unnecessary allocations in the stat routines
+
+# Version 0.7.17
+
+## Fixes
+
+* Fix a performance issue in `sort` due to the recent change in `Threads.@threads`.
+* Fix the allocation problem in computing `var` and `std` in the fast path of `gatherby`.
+* Fix an issue with Julia-latest.
+
+## Performance
+
+* Now we exploit multithreading when gathering observations for huge data sets.
+
+# Version 0.7.16
+
+## Fixes
+
+* Fix a problem that was causing tests to fail in Julia 1.9
+* Fix an issue with `eltype` and the output of `eachcol`. Now `eltype(::Type{<:DatasetColumns})` properly returns `AbstractDatasetColumn` instead of `AbstractVector`.
+* Fix a problem with `nonmissingtype` with `Union{}` output.
+* Fix an issue that was causing the join functions to sort already-sorted data sets, [issue #108](https://github.com/sl-solution/InMemoryDatasets.jl/issues/108)
+* Remove precompilation for Julia 1.9 - it causes an enormous amount of allocation during precompilation and loading
+
+## Features
+
+* Now `IMD` throws an error when a grouped data set is accessed after its parent has been modified.
+
+# Version 0.7.15
+
+## Fixes
+
+* Functions `searchsorted`, `searchsortedfirst`, and `searchsortedlast` now work with `DatasetColumn`
+* Fix a bug in `byrow(nunique)`
+
 # Version 0.7.14
 
 ## Fixes
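The new `byrow(allequal)` entry above is, per the changelog wording, just sugar over the existing `row_isequal` kernel. A minimal usage sketch (the data values are illustrative, not taken from the package's test suite):

```julia
using InMemoryDatasets

ds = Dataset(x = [1, 2, 3], y = [1, 5, 3], z = [1, 9, 3])

# `true` where every value in the row is equal (compared via `isequal`);
# equivalent to `byrow(ds, isequal, [:x, :y, :z])` with no `with` argument
byrow(ds, allequal, [:x, :y, :z])   # -> [true, false, true]  (needs Julia >= 1.8)
```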
diff --git a/Project.toml b/Project.toml
index 655523e3..5684a27f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "InMemoryDatasets"
 uuid = "5c01b14b-ab03-46ff-b164-14c663efdd9f"
 authors = ["sl-solution and contributors"]
-version = "0.7.14"
+version = "0.7.21"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
@@ -22,7 +22,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
 Compat = "3.17, 4"
-DataAPI = "1.8"
+DataAPI = "1.16"
 InvertedIndices = "1"
 IteratorInterfaceExtensions = "0.1.1, 1"
 Missings = "0.4.2, 1"
diff --git a/README.md b/README.md
index 22faca67..6da506ab 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ we do our best to keep the overall complexity of the package as low as possible
 * adding new features to the package
 * contributing to the package
 
-See [here](https://discourse.julialang.org/t/ann-a-new-lightning-fast-package-for-data-manipulation-in-pure-julia/78197) for some benchmarks.
+See [here](https://duckdblabs.github.io/db-benchmark/) for some benchmarks.
 
 # Features
 `InMemoryDatasets.jl` has many interesting features, here, we highlight some of our favourites (in no particular order):
diff --git a/docs/src/index.md b/docs/src/index.md
index 52b028f6..cc7af3a8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,7 +5,7 @@
 Welcome to the InMemoryDatasets.jl documentation!
 
 This resource aims to teach you everything you need to know to get up and running with the InMemoryDatasets.jl package.
 
-In memory Datasets is a collection of tools for working (manipulating, wrangling, cleaning, summarising,...) with tabular data in Julia.
+InMemoryDatasets is a collection of tools for working (manipulating, wrangling, cleaning, summarising,...) with tabular data in Julia.
 
 If you are new to InMemoryDatasets.jl, probably **[First steps with Datasets](https://sl-solution.github.io/InMemoryDatasets.jl/stable/man/basics/)** or **[Tutorial](https://sl-solution.github.io/InMemoryDatasets.jl/stable/man/tutorial/)** in manual should be good starting points.
diff --git a/docs/src/man/grouping.md b/docs/src/man/grouping.md
index 51dc008c..36074fea 100644
--- a/docs/src/man/grouping.md
+++ b/docs/src/man/grouping.md
@@ -155,6 +155,21 @@ julia> groupby(salary, 2)
   10        2
    3        3
    5        3
+
+julia> ds = Dataset(x=[1,1,2,2], y=[1,2,1,2], z=[1,1,1,1])
+
+julia> groupby!(ds, [:x, :y]) # group by more than one column
+4×3 Grouped Dataset with 4 groups
+Grouped by: x, y
+ Row │ x         y         z
+     │ identity  identity  identity
+     │ Int64?    Int64?    Int64?
+─────┼──────────────────────────────
+   1 │ 1         1         1
+   2 │ 1         2         1
+   3 │ 2         1         1
+   4 │ 2         2         1
+
 ```
 
 The `groupby!` and `groupby` functions accept the output of the `groupby` function. Thus, some may use these functions to incrementally group a data set.
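The "incrementally group" remark above is why `groupby` accepts its own output. A hedged sketch of that composition (column names are illustrative; per the sentence above, the refined result should match grouping by both columns at once):

```julia
ds = Dataset(x = [1, 1, 2, 2], y = [1, 2, 1, 2], z = [1, 1, 1, 1])

g1 = groupby(ds, :x)         # group by :x first
g2 = groupby(g1, :y)         # then refine the existing groups by :y
g  = groupby(ds, [:x, :y])   # expected to be equivalent to g2
```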
""" abstract type AbstractDataset end +abstract type AbstractDatasetColumn end + # DatasetColumn is a representation of a column of data set # it is wrapped into a new type to make sure that when ever a column is # selected, the data set is attached to it -struct DatasetColumn{T <: AbstractDataset, E} +struct DatasetColumn{T <: AbstractDataset, E} <: AbstractDatasetColumn col::Int ds::T val::E end -struct SubDatasetColumn{T <: AbstractDataset, E} +struct SubDatasetColumn{T <: AbstractDataset, E} <: AbstractDatasetColumn col::Int ds::T val::E @@ -308,7 +310,7 @@ function content(ds::AbstractDataset; output = false) for i in 1:ncol(ds) push!(f_v[1], all_names[i]) push!(f_v[2], getformat(ds, i)) - push!(f_v[3], nonmissingtype(eltype(ds[!, i]))) + push!(f_v[3], our_nonmissingtype(eltype(ds[!, i]))) end format_ds = Dataset(f_v, [:column, :format, :eltype], copycols = false) if !output diff --git a/src/abstractdataset/dscol.jl b/src/abstractdataset/dscol.jl index b5f9cf9f..be2dc40d 100644 --- a/src/abstractdataset/dscol.jl +++ b/src/abstractdataset/dscol.jl @@ -8,6 +8,8 @@ const SubOrDSCol = Union{SubDatasetColumn,DatasetColumn} # isequal also use for == , since we don't want missing be annoying Base.parent(col1::DatasetColumn) = col1.ds +Base.eachindex(col1::SubOrDSCol) = Base.axes1(col1) + Base.length(col1::SubOrDSCol) = length(__!(col1)) Base.size(col1::SubOrDSCol) = size(__!(col1)) Base.size(col1::SubOrDSCol, i::Integer) = size(__!(col1), i) @@ -18,12 +20,14 @@ Base.eltype(col1::SubOrDSCol) = eltype(__!(col1)) Base.ndims(col1::SubOrDSCol) = ndims(__!(col1)) Base.ndims(::Type{<:SubDatasetColumn}) = 1 Base.isassigned(col1::SubOrDSCol, i) = isassigned(__!(col1), i) +# FIXME: unsafe method - an alias of col1 is out and it can be modified without any control Base.identity(col1::SubOrDSCol) = identity(__!(col1)) Base.similar(col1::SubOrDSCol, args...) = similar(__!(col1), args...) Base.copy(col1::SubOrDSCol) = copy(__!(col1)) Base.pairs(col1::SubOrDSCol) = pairs(IndexLinear(), __!(col1)) Base.iterate(col1::SubOrDSCol, kwargs...) = iterate(__!(col1), kwargs...) PooledArrays.PooledArray(col1::SubOrDSCol; arg...) = PooledArray(__!(col1); arg...) +# FIXME: unsafe when alias are created Base.convert(T::Type{<:AbstractVector}, col1::SubOrDSCol) = convert(T, __!(col1)) DataAPI.refarray(col::SubOrDSCol) = DataAPI.refarray(__!(col)) DataAPI.refpool(col::SubOrDSCol) = DataAPI.refpool(__!(col)) @@ -160,3 +164,8 @@ function Base.sort!(col::SubOrDSCol; kws...) end Base.sort(col::SubOrDSCol; kws...) = sort(__!(col); kws...) Base.sortperm(col::SubOrDSCol; kws...) = sortperm(__!(col); kws...) + +Base.searchsortedfirst(col::SubOrDSCol, x; kws...) = searchsortedfirst(__!(col), x; kws...) +Base.searchsortedlast(col::SubOrDSCol, x; kws...) = searchsortedlast(__!(col), x; kws...) +Base.searchsorted(col::SubOrDSCol, x; kws...) = searchsorted(__!(col), x; kws...) + diff --git a/src/abstractdataset/iteration.jl b/src/abstractdataset/iteration.jl index 987a54b5..b3156936 100644 --- a/src/abstractdataset/iteration.jl +++ b/src/abstractdataset/iteration.jl @@ -234,7 +234,7 @@ Base.ndims(::DatasetColumns) = 1 Base.ndims(::Type{<:DatasetColumns}) = 1 Base.length(itr::DatasetColumns) = size(itr)[1] -Base.eltype(::Type{<:DatasetColumns}) = AbstractVector +Base.eltype(::Type{<:DatasetColumns}) = AbstractDatasetColumn Base.firstindex(itr::DatasetColumns) = 1 Base.lastindex(itr::DatasetColumns) = length(itr) @@ -394,7 +394,10 @@ Base.show(dfcs::DatasetColumns; # prevent using broadcasting to mutate columns e.g. 
diff --git a/src/abstractdataset/iteration.jl b/src/abstractdataset/iteration.jl
index 987a54b5..b3156936 100644
--- a/src/abstractdataset/iteration.jl
+++ b/src/abstractdataset/iteration.jl
@@ -234,7 +234,7 @@ Base.ndims(::DatasetColumns) = 1
 Base.ndims(::Type{<:DatasetColumns}) = 1
 
 Base.length(itr::DatasetColumns) = size(itr)[1]
-Base.eltype(::Type{<:DatasetColumns}) = AbstractVector
+Base.eltype(::Type{<:DatasetColumns}) = AbstractDatasetColumn
 
 Base.firstindex(itr::DatasetColumns) = 1
 Base.lastindex(itr::DatasetColumns) = length(itr)
@@ -394,7 +394,10 @@ Base.show(dfcs::DatasetColumns;
 # prevent using broadcasting to mutate columns e.g. in pop!.(eachcol(ds))
 # TODO customise Base.broadcasted to handle the situation
 for f in filter(x->occursin(r"!$", String(x)), names(Base))
-    @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColums is reserved."))
+    # FIXME due to a bug in Julia > 1.11 !?
+    if isdefined(Main, f)
+        @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColumns is reserved."))
+    end
 end
 for f in filter(x->occursin(r"!$", String(x)), names(Statistics))
     @eval Base.broadcasted(::typeof($f), ::DatasetColumns, args...) = throw(ArgumentError("broadcasting `$(nameof($f))` over DatasetColums is reserved."))
diff --git a/src/abstractdataset/show.jl b/src/abstractdataset/show.jl
index a1f3c4a5..8911bb01 100644
--- a/src/abstractdataset/show.jl
+++ b/src/abstractdataset/show.jl
@@ -118,7 +118,7 @@ function compacttype(T::Type, maxwidth::Int=8)
     textwidth(sT) ≤ maxwidth && return sT
 
     if T >: Missing
-        T = nonmissingtype(T)
+        T = our_nonmissingtype(T)
         sT = string(T)
         suffix = "?"
         textwidth(sT) ≤ maxwidth && return sT * suffix
@@ -223,7 +223,7 @@ function _show(io::IO,
    alignment_regex_complex = [r"(?
 Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(isequal), cols::ColumnIndex; with = nothing, threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = with, threads = threads)
+if VERSION >= v"1.8"
+    byrow(ds::AbstractDataset, ::typeof(allequal), cols::MultiColumnIndex; threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = nothing, threads = threads)
+    byrow(ds::AbstractDataset, ::typeof(allequal), cols::ColumnIndex; threads = nrow(ds) > Threads.nthreads()*10) = row_isequal(ds, cols, by = nothing, threads = threads)
+end
+
+
 byrow(ds::AbstractDataset, ::typeof(isless), cols::MultiColumnIndex; with, threads = nrow(ds) > Threads.nthreads()*10, rev::Bool = false, lt = isless) = row_isless(ds, cols, with, threads = threads, rev = rev, lt = lt)
 byrow(ds::AbstractDataset, ::typeof(isless), col::ColumnIndex; with, threads = nrow(ds) > Threads.nthreads()*10, rev::Bool = false, lt = isless) = row_isless(ds, [col], with, threads = threads, rev = rev, lt = lt)
 
@@ -167,7 +173,14 @@ byrow(ds::AbstractDataset, ::typeof(var), col::ColumnIndex; by = identity, dof =
 byrow(ds::AbstractDataset, ::typeof(std), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, dof = true, threads = nrow(ds) > Threads.nthreads()*10) = row_std(ds, by, cols; dof = dof, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(std), col::ColumnIndex; by = identity, dof = true, threads = nrow(ds) > Threads.nthreads()*10) = byrow(ds, std, [col]; by = by, dof = dof, threads = threads)
 
-byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true) = row_nunique(ds, by, cols; count_missing = count_missing)
+function byrow(ds::AbstractDataset, ::typeof(nunique), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, count_missing = true, threads=nrow(ds)>1000)
+    res = byrow(ds, x->length(Set(Base.Generator(by, x))), cols, threads=threads)
+    if count_missing
+        return res
+    else
+        return res .- row_any(ds, ismissing, cols)
+    end
+end
 byrow(ds::AbstractDataset, ::typeof(nunique), col::ColumnIndex; by = identity, count_missing = true) = byrow(ds, nunique, [col]; by = by, count_missing = count_missing)
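The rewritten `byrow(nunique)` above counts distinct values per row via a `Set` over the `by`-transformed values, avoiding the hash-collision problem that disabled the prehashing version (see the commented-out `row_nunique` later in this diff). A sketch of the semantics (data invented):

```julia
ds = Dataset(x = [1, 2, missing], y = [1, 3, missing], z = [2, 3, missing])

byrow(ds, nunique, 1:3)                         # -> [2, 2, 1]; missing counts as a value
byrow(ds, nunique, 1:3, count_missing = false)  # -> [2, 2, 0]; rows containing missing lose one count
```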
@@ -254,7 +267,7 @@ end
 function byrow(ds::AbstractDataset, f::Function, col::ColumnIndex; threads = nrow(ds)>1000, allowmissing::Bool = true)
     if threads
-        T = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(ds[!, col]))})
+        T = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(ds[!, col]))})
         if allowmissing
             res = Vector{Union{Missing, T}}(undef, nrow(ds))
         else
@@ -262,7 +275,7 @@ function byrow(ds::AbstractDataset, f::Function, col::ColumnIndex; threads = nro
         end
         _hp_map_a_function!(res, f, _columns(ds)[index(ds)[col]])
     else
-        T = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(ds[!, col]))})
+        T = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(ds[!, col]))})
         if allowmissing
             res = Vector{Union{Missing, T}}(undef, nrow(ds))
         else
diff --git a/src/byrow/doc.jl b/src/byrow/doc.jl
index 746ac473..a7a1f793 100644
--- a/src/byrow/doc.jl
+++ b/src/byrow/doc.jl
@@ -16,6 +16,8 @@ function Docs.getdoc(x::typeof(byrow), y)
         return _get_doc_byrow("prod")
     elseif y == Tuple{typeof(isequal)}
         return _get_doc_byrow("isequal")
+    elseif VERSION >= v"1.8" && y == Tuple{typeof(allequal)}
+        return _get_doc_byrow("allequal")
     elseif y == Tuple{typeof(isless)}
         return _get_doc_byrow("isless")
     elseif y == Tuple{typeof(in)}
@@ -105,6 +107,7 @@ Perform a row-wise operation specified by `fun` on selected columns `cols`. Gene
 # Reduction operations
 - `all`
+- `allequal` (this needs Julia 1.8 or later)
 - `any`
 - `argmax`
 - `argmin`
@@ -369,6 +372,14 @@ julia> byrow(ds, isequal, [1,2], with = [2,2,2,3,3,3])
 0
 0
 ```
+@@@@allequal@@@@
+    byrow(ds::AbstractDataset, allequal, cols; [threads])
+
+Returns a boolean vector which is `true` if all values in the corresponding row are equal (using `isequal`).
+
+Passing `threads = false` disables multithreaded computations.
+
+See [`byrow(isequal)`](@ref), [`byrow(isless)`](@ref), [`byrow(in)`](@ref), [`byrow(issorted)`](@ref)
 @@@@isless@@@@
     byrow(ds::AbstractDataset, isless, cols, [with, threads, rev = false, lt = isless])
diff --git a/src/byrow/hp_row_functions.jl b/src/byrow/hp_row_functions.jl
index 371e0cc3..40abfda5 100644
--- a/src/byrow/hp_row_functions.jl
+++ b/src/byrow/hp_row_functions.jl
@@ -1,6 +1,6 @@
 function hp_row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
-    T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])
+    T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     m = Matrix{T}(ds[!, colsidx])
     Threads.@threads for i in 1:size(m, 1)
         @views sort!(m[i, :]; kwargs...)
@@ -92,7 +92,7 @@ function _hp_row_generic_vec!(res, ds, f, colsidx, ::Val{T}) where T
     max_cz = length(res) - 1000 - (loopsize - 1)*1000
     inmat_all = [Matrix{T}(undef, length(colsidx), max_cz) for i in 1:nt]
     # make sure that the variable inside the loop are not the same as the out of scope one
-    Threads.@threads for i in 1:loopsize
+    Threads.@threads :static for i in 1:loopsize
         t_st = i*1000 + 1
         i == loopsize ? t_en = length(res) : t_en = (i+1)*1000
         _fill_matrix!(inmat_all[Threads.threadid()], all_data, t_st:t_en, colsidx)
diff --git a/src/byrow/row_functions.jl b/src/byrow/row_functions.jl
index f7f4058e..2d7df23a 100644
--- a/src/byrow/row_functions.jl
+++ b/src/byrow/row_functions.jl
@@ -33,9 +33,9 @@ function row_sum(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missi
     colsidx = multiple_getindex(index(ds), cols)
     CT = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     T = Core.Compiler.return_type(f, Tuple{CT})
-    CT = nonmissingtype(T)
-    CT <: Base.SmallSigned ? CT = Int : nothing
-    CT <: Base.SmallUnsigned ? CT = UInt : nothing
+    CT = our_nonmissingtype(T)
+    CT <: SMALLSIGNED ? CT = Int : nothing
+    CT <: SMALLUNSIGNED ? CT = UInt : nothing
     CT <: Bool ? CT = Int : nothing
     T = Union{Missing, CT}
     init0 = _missings(T, nrow(ds))
@@ -68,9 +68,9 @@ function row_prod(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missi
     colsidx = multiple_getindex(index(ds), cols)
     CT = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     T = Core.Compiler.return_type(f, Tuple{CT})
-    CT = nonmissingtype(T)
-    CT <: Base.SmallSigned ? CT = Int : nothing
-    CT <: Base.SmallUnsigned ? CT = UInt : nothing
+    CT = our_nonmissingtype(T)
+    CT <: SMALLSIGNED ? CT = Int : nothing
+    CT <: SMALLUNSIGNED ? CT = UInt : nothing
     CT <: Bool ? CT = Int : nothing
     T = Union{Missing, CT}
     init0 = _missings(T, nrow(ds))
@@ -744,9 +744,9 @@ function row_cumsum!(ds::Dataset, cols = names(ds, Union{Missing, Number}); miss
     colsidx = index(ds)[cols]
     T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     if T <: Union{Missing, INTEGERS}
-        T <: Union{Missing, Base.SmallSigned}
-        T = T <: Union{Missing, Base.SmallSigned, Bool} ? Union{Int, Missing} : T
-        T = T <: Union{Missing, Base.SmallUnsigned} ? Union{Missing, UInt} : T
+        T <: Union{Missing, SMALLSIGNED}
+        T = T <: Union{Missing, SMALLSIGNED, Bool} ? Union{Int, Missing} : T
+        T = T <: Union{Missing, SMALLUNSIGNED} ? Union{Missing, UInt} : T
     end
     for i in colsidx
         if eltype(ds[!, i]) >: Missing
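The `SMALLSIGNED`/`SMALLUNSIGNED` branches above widen small integer accumulators before row reductions, much as `Base.sum` does, so narrow columns cannot overflow. A minimal illustration (data invented):

```julia
ds = Dataset(a = Int8[100, 100], b = Int8[100, 100])

# the accumulator type is widened from Int8 to Int, so 100 + 100 does not wrap
byrow(ds, sum, [:a, :b])   # -> [200, 200] instead of Int8 overflow
```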
@@ -1004,7 +1004,7 @@
 function row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
-    T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])
+    T = mapreduce(eltype, promote_type, view(_columns(ds),colsidx))
     m = Matrix{T}(ds[!, colsidx])
     sort!(m; dims = 2, kwargs...)
     for i in 1:length(colsidx)
@@ -1077,25 +1077,26 @@ function _fill_dict_and_add!(init0, dict, prehashed, n, p)
     end
 end
 
-function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
-    colsidx = multiple_getindex(index(ds), cols)
-    prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
-    allcols = view(_columns(ds),colsidx)
+# This is not working - we only have the hash values, and in many cases (e.g. 2.1 and 4611911198408756429) the hashes are the same
+# function row_nunique(ds::AbstractDataset, f::Function, cols = names(ds, Union{Missing, Number}); count_missing = true)
+#     colsidx = multiple_getindex(index(ds), cols)
+#     prehashed = Matrix{_Prehashed}(undef, size(ds,1), length(colsidx))
+#     allcols = view(_columns(ds),colsidx)
 
-    for j in 1:size(prehashed,2)
-        _fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
-    end
+#     for j in 1:size(prehashed,2)
+#         _fill_prehashed!(prehashed, allcols[j], f, size(ds,1), j)
+#     end
 
-    init0 = zeros(Int32, size(ds,1))
-    dict = Dict{_Prehashed, Nothing}()
-    _fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
-    if count_missing
-        return init0
-    else
-        return init0 .- row_any(ds, ismissing, cols)
-    end
-end
-row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
+#     init0 = zeros(Int32, size(ds,1))
+#     dict = Dict{_Prehashed, Nothing}()
+#     _fill_dict_and_add!(init0, dict, prehashed, size(ds,1), length(colsidx))
+#     if count_missing
+#         return init0
+#     else
+#         return init0 .- row_any(ds, ismissing, cols)
+#     end
+# end
+# row_nunique(ds::AbstractDataset, cols = names(ds, Union{Missing, Number}); count_missing = true) = row_nunique(ds, identity, cols; count_missing = count_missing)
 
 Base.@propagate_inbounds function _op_for_hash!(x, y, f, lo, hi)
     @simd for i in lo:hi
diff --git a/src/byrow/util.jl b/src/byrow/util.jl
index f77a5e47..a42d2814 100644
--- a/src/byrow/util.jl
+++ b/src/byrow/util.jl
@@ -296,6 +296,11 @@ end
     return pos
 end
 
+# before Julia 1.10 these functions were defined in Ryu; however, they moved to Base and their syntax has changed.
+# we only use them here, so we define them for our own purpose
+_memcpy(d, doff, s, soff, n) = (ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), d + doff - 1, s + soff - 1, n); nothing)
+_memmove(d, doff, s, soff, n) = (ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), d + doff - 1, s + soff - 1, n); nothing)
+
 ### From Base.Ryu, because we need buf to be a View of an array, not a Vector (maybe we should change it in Ryu?)
 function _writeshortest(buf, pos, x::T,
                         plus=false, space=false, hash=true,
@@ -423,7 +428,7 @@ function _writeshortest(buf, pos, x::T,
     end
 
     i = 0
     ptr = pointer(buf)
-    ptr2 = pointer(Base.Ryu.DIGIT_TABLE)
+    ptr2 = pointer(our_DIGIT_TABLE)
     if (output >> 32) != 0
         q = output ÷ 100000000
         output2 = (output % UInt32) - UInt32(100000000) * (q % UInt32)
@@ -436,10 +441,10 @@ function _writeshortest(buf, pos, x::T,
         c1 = (c ÷ 100) << 1
         d0 = (d % 100) << 1
         d1 = (d ÷ 100) << 1
-        Base.Ryu.memcpy(ptr, pos + olength - 2, ptr2, c0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 4, ptr2, c1 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 6, ptr2, d0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - 8, ptr2, d1 + 1, 2)
+        _memcpy(ptr, pos + olength - 2, ptr2, c0 + 1, 2)
+        _memcpy(ptr, pos + olength - 4, ptr2, c1 + 1, 2)
+        _memcpy(ptr, pos + olength - 6, ptr2, d0 + 1, 2)
+        _memcpy(ptr, pos + olength - 8, ptr2, d1 + 1, 2)
         i += 8
     end
     output2 = output % UInt32
@@ -448,20 +453,20 @@ function _writeshortest(buf, pos, x::T,
         output2 = div(output2, UInt32(10000))
         c0 = (c % 100) << 1
         c1 = (c ÷ 100) << 1
-        Base.Ryu.memcpy(ptr, pos + olength - i - 2, ptr2, c0 + 1, 2)
-        Base.Ryu.memcpy(ptr, pos + olength - i - 4, ptr2, c1 + 1, 2)
+        _memcpy(ptr, pos + olength - i - 2, ptr2, c0 + 1, 2)
+        _memcpy(ptr, pos + olength - i - 4, ptr2, c1 + 1, 2)
         i += 4
     end
     if output2 >= 100
         c = (output2 % UInt32(100)) << 1
         output2 = div(output2, UInt32(100))
-        Base.Ryu.memcpy(ptr, pos + olength - i - 2, ptr2, c + 1, 2)
+        _memcpy(ptr, pos + olength - i - 2, ptr2, c + 1, 2)
         i += 2
     end
     if output2 >= 10
         c = output2 << 1
-        buf[pos + 1] = Base.Ryu.DIGIT_TABLE[c + 2]
-        buf[pos - exp_form] = Base.Ryu.DIGIT_TABLE[c + 1]
+        buf[pos + 1] = our_DIGIT_TABLE[c + 2]
+        buf[pos - exp_form] = our_DIGIT_TABLE[c + 1]
     else
         buf[pos - exp_form] = UInt8('0') + (output2 % UInt8)
     end
@@ -498,7 +503,7 @@ function _writeshortest(buf, pos, x::T,
         end
     else
         pointoff = olength - abs(nexp)
-        Base.Ryu.memmove(ptr, pos + pointoff + 1, ptr, pos + pointoff, olength - pointoff + 1)
+        _memmove(ptr, pos + pointoff + 1, ptr, pos + pointoff, olength - pointoff + 1)
         buf[pos + pointoff] = decchar
         pos += olength + 1
         precision -= olength
@@ -543,11 +548,11 @@ function _writeshortest(buf, pos, x::T,
 
     if exp2 >= 100
         c = exp2 % 10
-        Base.Ryu.memcpy(ptr, pos, ptr2, 2 * div(exp2, 10) + 1, 2)
+        _memcpy(ptr, pos, ptr2, 2 * div(exp2, 10) + 1, 2)
         buf[pos + 2] = UInt8('0') + (c % UInt8)
         pos += 3
     elseif exp2 >= 10
-        Base.Ryu.memcpy(ptr, pos, ptr2, 2 * exp2 + 1, 2)
+        _memcpy(ptr, pos, ptr2, 2 * exp2 + 1, 2)
         pos += 2
     else
         if padexp
@@ -565,3 +570,17 @@ function _writeshortest(buf, pos, x::T,
 
     return pos
 end
+
+# FIXME in versions > 1.11 Julia has changed DIGIT_TABLE; we need to update this for our purpose
+const our_DIGIT_TABLE = UInt8[
+    '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
+    '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
+    '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
+    '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
+    '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
+    '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
+    '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
+    '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
+    '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
+    '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
+]
\ No newline at end of file
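`our_DIGIT_TABLE` is the classic Ryu two-digit lookup: the ASCII digits of 00 through 99 stored back to back, so both digits of a two-digit chunk are written with a single 2-byte `_memcpy`. A sanity check of the indexing convention used by `_writeshortest` above:

```julia
# for a two-digit value c, its digits live at table[2c + 1] and table[2c + 2];
# `_memcpy(ptr, pos, ptr2, 2c + 1, 2)` copies both bytes at once
c = 73
our_DIGIT_TABLE[2c + 1] == UInt8('7')   # tens digit
our_DIGIT_TABLE[2c + 2] == UInt8('3')   # ones digit
```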
diff --git a/src/dataset/getindex.jl b/src/dataset/getindex.jl
index 2fa9645d..3894eeb6 100644
--- a/src/dataset/getindex.jl
+++ b/src/dataset/getindex.jl
@@ -61,6 +61,8 @@ function _check_consistency(ds::Dataset)
 end
 
 function _check_consistency(ds::AbstractDataset)
+    # FIXME we should check the creation date of sub-data; however, it is not working in some situations, e.g. modify!(sds, ...)
+    # TODO however, we should add this whenever it is possible: getfield(ds, :created) == _get_lastmodified(_attributes(parent(ds)))
     if ds isa SubDataset
         @assert length(index(ds).remap) == length(index(parent(ds))) "The parent data set which this view is based on, has been modified. To fix the issue recreate the view"
     end
diff --git a/src/dataset/modify.jl b/src/dataset/modify.jl
index 6f0afe56..adcc20dd 100644
--- a/src/dataset/modify.jl
+++ b/src/dataset/modify.jl
@@ -255,7 +255,7 @@ function normalize_modify!(outidx::Index, idx,
                            @nospecialize(sel::Pair{<:ColumnIndex, <:Vector{<:Base.Callable}}))
     colsidx = outidx[sel.first]
-    normalize_modify!(outidx, idx, colsidx .=> sel.second[i])
+    normalize_modify!(outidx, idx, colsidx .=> sel.second)
     return res
 end
@@ -597,7 +597,7 @@ end
 # the number of destination can be smaller or greater than the number of elements of Tuple,
 function _modify_multiple_out!(ds, x, dst)
-    !(nonmissingtype(eltype(x)) <: Tuple) && throw(ArgumentError("to use `splitter`, the source column must be a vector of Tuple"))
+    !(our_nonmissingtype(eltype(x)) <: Tuple) && throw(ArgumentError("to use `splitter`, the source column must be a vector of Tuple"))
     tb = Tables.columntable(x)
     for j in 1:length(dst)
         try
diff --git a/src/dataset/other.jl b/src/dataset/other.jl
index d5ce1e6d..66c9c217 100755
--- a/src/dataset/other.jl
+++ b/src/dataset/other.jl
@@ -492,7 +492,7 @@ function Base.map!(ds::AbstractDataset, f::Vector{<:Function}, cols::MultiColumn
         # Core.Compiler.return_type cannot handle the situations like x->ismissing(x) ? 0 : x when x is missing and float, since the output of Core.Compiler.return_type is Union{Missing, Float64, Int64}
         # we remove missing and then check the result,
         # TODO is there any problem with this?
-        T = Core.Compiler.return_type(f[j], Tuple{nonmissingtype(CT)})
+        T = Core.Compiler.return_type(f[j], Tuple{our_nonmissingtype(CT)})
         T = Union{Missing, T}
         if promote_type(T, CT) <: CT
             if threads && DataAPI.refpool(_columns(ds)[colsidx[j]]) === nothing
diff --git a/src/dataset/transpose.jl b/src/dataset/transpose.jl
index b1effdd5..d132a3d2 100644
--- a/src/dataset/transpose.jl
+++ b/src/dataset/transpose.jl
@@ -408,7 +408,7 @@ end
 function _fill_outputmat_withoutid(T, in_cols, ds, starts, perms, new_col_names, row_names_length, threads; default_fill = missing)
-    @assert _check_allocation_limit(nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there is not enough resource, check the passed arguments."
+    @assert _check_allocation_limit(our_nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there are not enough resources; check the passed arguments."
     CT = promote_type(T, typeof(default_fill))
     # outputmat = [__fill!(_our_vect_alloc(CT, row_names_length*_ngroups(ds)), default_fill) for _ in 1:length(new_col_names)]
     outputmat = Vector{typeof(_our_vect_alloc(CT, 0))}(undef, length(new_col_names))
@@ -420,7 +420,7 @@ end
 function _fill_outputmat_withid(T, in_cols, ds, starts, perms, ids, new_col_names, row_names_length, threads; default_fill = missing)
-    @assert _check_allocation_limit(nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there is not enough resource, check the passed arguments."
+    @assert _check_allocation_limit(our_nonmissingtype(T), row_names_length*_ngroups(ds), length(new_col_names)) < 1.0 "The output data set is huge and there are not enough resources; check the passed arguments."
     CT = promote_type(T, typeof(default_fill))
     # outputmat = [fill!(_our_vect_alloc(CT, row_names_length*_ngroups(ds)), default_fill) for _ in 1:length(new_col_names)]
     outputmat = Vector{typeof(_our_vect_alloc(CT, 0))}(undef, length(new_col_names))
@@ -787,7 +787,7 @@ function flatten!(ds::Dataset,
     for col in 2:length(idxcols)
         if mapformats
             f_fmt = getformat(ds, idxcols[col])
-            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col]), threads = threads)
+            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col], threads = threads))
         else
             push!(all_idxcols, _columns(ds)[idxcols[col]])
         end
@@ -854,7 +854,7 @@ function flatten(ds::AbstractDataset,
     for col in 2:length(idxcols)
         if mapformats
             f_fmt = getformat(ds, idxcols[col])
-            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col]), threads = threads)
+            push!(all_idxcols, byrow(ds, f_fmt, idxcols[col], threads = threads))
         else
             push!(all_idxcols, _columns(ds)[idxcols[col]])
         end
diff --git a/src/join/closejoin.jl b/src/join/closejoin.jl
index c969dae9..64799912 100644
--- a/src/join/closejoin.jl
+++ b/src/join/closejoin.jl
@@ -274,7 +274,8 @@ function _fill_right_cols_table_close!(_res, x, ranges, total, borderval, fill_
 end
 
-function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, direction, lmf, rmf, j; nsfpaj = true, threads = true)
+function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, direction, lmf, rmf, j; nsfpaj=nsfpaj, threads = true)
+    nsfpaj_in = nsfpaj[1]
     var_l = _columns(dsl)[oncols_left[j]]
     var_r = _columns(dsr)[oncols_right[j]]
     l_idx = oncols_left[j]
@@ -292,8 +293,8 @@ function _change_refpool_find_range_for_close!(ranges, dsl, dsr, r_perms, oncols
 
     T1 = Core.Compiler.return_type(_fl, Tuple{eltype(var_l)})
 
-    if DataAPI.refpool(var_r) !== nothing && nsfpaj
-        true && throw(ErrorException("we shouldn't end up here"))
+    if DataAPI.refpool(var_r) !== nothing && nsfpaj_in
+        throw(ErrorException("we shouldn't end up here"))
     else
         T2 = Core.Compiler.return_type(_fr, Tuple{eltype(var_r)})
         if direction == :backward
@@ -327,10 +328,10 @@ function _join_closejoin(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, m
        throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically."))
    end
-    nsfpaj = true
+    nsfpaj = [true]
     # if the column for close join is a PA we cannot use the fast path
     if DataAPI.refpool(_columns(dsr)[oncols_right[end]]) !== nothing
-        nsfpaj = false
+        nsfpaj = [false]
     end
     if length(oncols_left) > 1 && method == :hash
         ranges, a, idx, minval, reps, sz, right_cols_2 = _find_ranges_for_join_using_hash(dsl, dsr, onleft[1:end-1], onright[1:end-1], mapformats, true, Val(T), threads = threads)
diff --git a/src/join/join.jl b/src/join/join.jl
index 8e0ac90a..4f966e6c 100644
--- a/src/join/join.jl
+++ b/src/join/join.jl
@@ -70,48 +70,63 @@ function _fill_range_for_accelerated_join!(ranges, starts, loc, x, f, sz, chunk;
     end
 end
 # TODO how the hashing behave for Categorical Arrays?
-function _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, chunk = 2^10; nsfpaj = true, threads = true)
+function _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, chunk = 2^10; nsfpaj=nsfpaj, threads = true)
+    # nsfpaj has no default value, to make sure the caller passes it
+    # we use a vector to represent nsfpaj because we may override its value
+    nsfpaj_in = nsfpaj[1]
+
     if isempty(dsr)
         idx = []
         fill!(ranges, 1:nrow(dsr))
         last_valid_range = -1
     else
-        if accelerate
-            if mapformats[2]
-                _fr = getformat(dsr, oncols_right[1])
-            else
-                _fr = identity
-            end
-            grng = _divide_for_fast_join(_columns(dsr)[oncols_right[1]], _fr, chunk; threads = threads)
-            if mapformats[1]
-                _fl = getformat(dsl, oncols_left[1])
-            else
-                _fl = identity
-            end
-            _fill_range_for_accelerated_join!(ranges, grng.starts, grng.starts_loc, _columns(dsl)[oncols_left[1]], _fl, nrow(dsr), chunk; threads = threads)
-            if dsr isa SubDataset
-                starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
-
-            else
-                starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
-            end
+        # check whether the data are already sorted; if so, it overrides accelerate
+        if _check_for_fast_sort(dsr, oncols_right, fill(false, length(oncols_right)), mapformats[2]; notsortpaforjoin = false, givenrange = nothing) == 0
+            # if it is already sorted based on what we want, we can safely change nsfpaj to false
+            nsfpaj[1] = false
+            idx = 1:nrow(dsr)
+            last_valid_range = _ngroups(dsr)
+            fill!(ranges, 1:nrow(dsr))
         else
-            if dsr isa SubDataset
-                starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, threads = threads)
+
+            if accelerate
+                if mapformats[2]
+                    _fr = getformat(dsr, oncols_right[1])
+                else
+                    _fr = identity
+                end
+                grng = _divide_for_fast_join(_columns(dsr)[oncols_right[1]], _fr, chunk; threads = threads)
+                if mapformats[1]
+                    _fl = getformat(dsl, oncols_left[1])
+                else
+                    _fl = identity
+                end
+                _fill_range_for_accelerated_join!(ranges, grng.starts, grng.starts_loc, _columns(dsl)[oncols_left[1]], _fl, nrow(dsr), chunk; threads = threads)
+                if dsr isa SubDataset
+                    starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
+
+                else
+                    starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
+                end
             else
-                starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, threads = threads)
+                if dsr isa SubDataset
+                    starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, threads = threads)
+                else
+                    starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, threads = threads)
+                end
+                fill!(ranges, 1:nrow(dsr))
             end
-            fill!(ranges, 1:nrow(dsr))
         end
     end
     idx, last_valid_range == length(idx)
 end
 
 function _sort_for_join_after_hash(dsr, oncols_right, stable, alg, mapformats, nsfpaj, grng; threads = true)
+    nsfpaj_in = nsfpaj[1]
     if dsr isa SubDataset
-        starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
+        starts, idx, last_valid_range = _sortperm_v(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
     else
-        starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj, givenrange = grng, threads = threads)
+        starts, idx, last_valid_range = _sortperm(dsr, oncols_right, stable = stable, a = alg, mapformats = mapformats[2], notsortpaforjoin = nsfpaj_in, givenrange = grng, threads = threads)
     end
 end
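A note on the recurring `nsfpaj = [true]` pattern introduced across these join functions: a one-element vector serves as a mutable box, so a callee (here the already-sorted fast path) can flip the flag for its caller. A minimal sketch of the pattern; `Ref{Bool}` would be the more conventional spelling:

```julia
flag = [true]                    # one-element vector as a mutable box
disable!(f) = (f[1] = false)     # a callee can override the caller's flag
disable!(flag)
flag[1]                          # -> false
```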
@@ -423,7 +438,8 @@ function _mark_lt_part!(inbits, x_l, x_r, _fl::F1, _fr::F2, ranges, r_perms, en,
     our_cumsum!(revised_ends)
 end
 
-function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj = true, threads = true)
+function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_left, oncols_right, lmf, rmf, j; type = :both, nsfpaj=nsfpaj, threads = true)
+    nsfpaj_in = nsfpaj[1]
     var_l = _columns(dsl)[oncols_left[j]]
     var_r = _columns(dsr)[oncols_right[j]]
     l_idx = oncols_left[j]
@@ -441,7 +457,7 @@ function _change_refpool_find_range_for_join!(ranges, dsl, dsr, r_perms, oncols_
 
     T1 = Core.Compiler.return_type(DataAPI.unwrap∘_fl, Tuple{eltype(var_l)})
 
-    if DataAPI.refpool(var_r) !== nothing && nsfpaj
+    if DataAPI.refpool(var_r) !== nothing && nsfpaj_in
         # sort taken care for refs ordering of modified values, but we still need to change refs
         if _fr == identity
             var_r_cpy = var_r
@@ -463,6 +479,7 @@ end
 function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match::Bool = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T
     isempty(dsl) && return copy(dsl)
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -480,10 +497,10 @@ function _join_left(dsl, dsr, ::Val{T}; onleft, onright, makeunique = false, map
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate; threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate; nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads)
         end
     end
     new_ends = map(x -> max(1, length(x)), ranges)
@@ -553,6 +570,7 @@ end
 function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeunique = false, mapformats = [true, true], stable = false, alg = HeapSort, check = true, accelerate = false, method = :sort, threads = true, multiple_match = false, multiple_match_name = :multiple, obs_id = [false, false], obs_id_name = :obs_id) where T
     isempty(dsl) && return dsl
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -569,9 +587,9 @@ function _join_left!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
     if !all(x->length(x) <= 1, ranges)
@@ -660,11 +678,11 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
        throw(ArgumentError("duplicate column names, pass `makeunique = true` to make them unique using a suffix automatically."))
    end
-    nsfpaj = true
+    nsfpaj = [true]
     # if the columns for an inequality-like join are PA we cannot use the fast path
     if type != :both
         if any(i-> DataAPI.refpool(_columns(dsr)[i]) !== nothing, right_range_cols)
-            nsfpaj = false
+            nsfpaj = [false]
         end
     end
     # if (onright_range === nothing || length(onleft) > 1) is false, then we have inequality kind join with no exact match join
@@ -689,7 +707,7 @@ function _join_inner(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, onrig
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range == nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate && (onright_range === nothing || length(oncols_right)>1); nsfpaj = nsfpaj, threads = threads)
 
         for j in 1:length(oncols_left)-1
             _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j; nsfpaj = nsfpaj, threads = threads)
@@ -784,7 +802,7 @@ function _in(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
     isempty(dsl) && return Bool[]
     oncols_left = onleft
     oncols_right = onright
-
+    nsfpaj = [true]
     # use Set when there is only one column in `on`
     if length(oncols_right) == 1
         if mapformats[1]
@@ -800,9 +818,9 @@ function _in(dsl::AbstractDataset, dsr::AbstractDataset, ::Val{T}; onleft, onrig
         return _in_use_Set(_columns(dsl)[oncols_left[1]], _columns(dsr)[oncols_right[1]], _fl, _fr, threads = threads)
     end
     ranges = Vector{UnitRange{T}}(undef, nrow(dsl))
-    idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+    idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
     for j in 1:length(oncols_left)
-        _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+        _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
     end
     map(x -> length(x) == 0 ? false : true, ranges)
 end
@@ -875,6 +893,7 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
     (isempty(dsl) || isempty(dsr)) && throw(ArgumentError("in `outerjoin` both left and right tables must be non-empty"))
     oncols_left = onleft
     oncols_right = onright
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, makeunique, Val(T); threads = threads)
     elseif method == :sort
@@ -889,9 +908,9 @@ function _join_outer(dsl, dsr::AbstractDataset, ::Val{T}; onleft, onright, makeu
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj = nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
     new_ends = map(x -> max(1, length(x)), ranges)
diff --git a/src/join/join_dict.jl b/src/join/join_dict.jl
index e6a1901b..eb1f7826 100644
--- a/src/join/join_dict.jl
+++ b/src/join/join_dict.jl
@@ -60,7 +60,7 @@ function _create_dictionary_for_join(f, v, fl, vl, ::Val{T}) where T
         maxval = hp_maximum(DataAPI.refarray(v))
         rangelen = maxval - minval + 1
         _create_dictionary_for_join_int(identity, DataAPI.refarray(v), minval, rangelen, Val(T))
-    elseif nonmissingtype(return_type(f, v)) <: AbstractVector{<:Union{Missing, INTEGERS}} && nonmissingtype(return_type(fl, vl)) <: AbstractVector{<:Union{Missing, INTEGERS}}
+    elseif our_nonmissingtype(return_type(f, v)) <: AbstractVector{<:Union{Missing, INTEGERS}} && our_nonmissingtype(return_type(fl, vl)) <: AbstractVector{<:Union{Missing, INTEGERS}}
         minval = hp_minimum(f, v)
         # if minval is missing all values are missing
         if ismissing(minval)
@@ -531,8 +531,8 @@ function _update!_dict(dsl, dsr, ranges, onleft, onright, right_cols, ::Val{T};
     for j in 1:length(right_cols)
         if haskey(index(dsl).lookup, _names(dsr)[right_cols[j]])
             left_cols_idx = index(dsl)[_names(dsr)[right_cols[j]]]
-            TL = nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
-            TR = nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
+            TL = our_nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
+            TR = our_nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
             if promote_type(TR, TL) <: TL
                 _update_left_with_right!(_columns(dsl)[left_cols_idx], _columns(dsr)[right_cols[j]], ranges, allowmissing, f_mode, threads = threads, op = op)
             end
diff --git a/src/join/update.jl b/src/join/update.jl
index dfd155fc..355e04d9 100644
--- a/src/join/update.jl
+++ b/src/join/update.jl
@@ -28,6 +28,7 @@ end
 function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright, check = true, allowmissing = true, mode = :all, mapformats = [true, true], stable = false, alg = HeapSort, accelerate = false, usehash = true, method = :sort, threads = true, op = nothing) where T
     isempty(dsl) && return dsl
+    nsfpaj = [true]
     if method == :hash
         ranges, a, idx, minval, reps, sz, right_cols = _find_ranges_for_join_using_hash(dsl, dsr, onleft, onright, mapformats, true, Val(T); threads = threads)
     elseif method == :sort
@@ -42,10 @@ function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright,
             return result
         end
     end
-        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, threads = threads)
+        idx, uniquemode = _find_permute_and_fill_range_for_join!(ranges, dsr, dsl, oncols_right, oncols_left, stable, alg, mapformats, accelerate, nsfpaj=nsfpaj, threads = threads)
         for j in 1:length(oncols_left)
-            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, threads = threads)
+            _change_refpool_find_range_for_join!(ranges, dsl, dsr, idx, oncols_left, oncols_right, mapformats[1], mapformats[2], j, nsfpaj = nsfpaj, threads = threads)
         end
     end
@@ -59,8 +60,8 @@ function _update!(dsl::Dataset, dsr::AbstractDataset, ::Val{T}; onleft, onright,
     for j in 1:length(right_cols)
         if haskey(index(dsl).lookup, _names(dsr)[right_cols[j]])
             left_cols_idx = index(dsl)[_names(dsr)[right_cols[j]]]
-            TL = nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
-            TR = nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
+            TL = our_nonmissingtype(eltype(_columns(dsl)[left_cols_idx]))
+            TR = our_nonmissingtype(eltype(_columns(dsr)[right_cols[j]]))
             if promote_type(TR, TL) <: TL
                 _update_left_with_right!(_columns(dsl)[left_cols_idx], view(_columns(dsr)[right_cols[j]], idx), ranges, allowmissing, f_mode, threads = threads, op = op)
             end
diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl
index b91b281a..958f57fc 100644
--- a/src/other/broadcasting.jl
+++ b/src/other/broadcasting.jl
@@ -226,7 +226,7 @@ function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataset)
     if src isa SubDataset
         if !wascopied
             src = SubDataset(_our_copy(parent(src), copycols=false),
-                             index(src), rows(src))
+                             index(src), rows(src), _get_lastmodified(_attributes(parent(src))))
         end
         parentidx = parentcols(index(src), i)
         parent(src)[!, parentidx] = Base.unaliascopy(_columns(parent(src))[parentidx])
@@ -254,7 +254,7 @@ function _broadcast_unalias_helper(dest::AbstractDataset, scol::AbstractVector,
     if src isa SubDataset
         if !wascopied
             src =SubDataset(_our_copy(parent(src), copycols=false),
-                            index(src), rows(src))
+                            index(src), rows(src), _get_lastmodified(_attributes(parent(src))))
         end
         parentidx = parentcols(index(src), col2)
         parent(src)[!, parentidx] = Base.unaliascopy(_columns(parent(src))[parentidx])
diff --git a/src/other/utils.jl b/src/other/utils.jl
index c6306412..fc8d14df 100644
--- a/src/other/utils.jl
+++ b/src/other/utils.jl
@@ -1,6 +1,16 @@
 const INTEGERS = Union{Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Bool}
 const FLOATS = Union{Float16, Float32, Float64}
 
+function our_nonmissingtype(x)
+    T = nonmissingtype(x)
+    if T === Union{}
+        Missing
+    else
+        T
+    end
+end
+
+
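This wrapper is the fix for the 0.7.16 changelog item about `nonmissingtype` and `Union{}`: for a column whose element type is plain `Missing`, stripping `Missing` leaves the empty union, which breaks downstream `Tuple{...}` construction and subtype checks. A quick illustration:

```julia
nonmissingtype(Union{Missing, Int})   # -> Int64
nonmissingtype(Missing)               # -> Union{}, the empty union
our_nonmissingtype(Missing)           # -> Missing, a type that can still be used
```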
 # work around slow allocation of type union in julia
 function _our_vect_alloc(T, len)
     if len > 0
@@ -34,7 +44,7 @@ function return_type(f::Function, x)
     if eltype(x) <: AbstractVector
         return return_type_tuple(f, x)
     end
-    CT = nonmissingtype(eltype(x))
+    CT = our_nonmissingtype(eltype(x))
     T = Core.Compiler.return_type(f, Tuple{Vector{CT}})
     # workaround for SubArray type
     if T <: SubArray
@@ -50,7 +60,7 @@ function return_type(f::Function, x)
 end
 
 function return_type_tuple(f::Function, x)
-    CT = ntuple(i -> nonmissingtype(eltype(x[i])), length(x))
+    CT = ntuple(i -> our_nonmissingtype(eltype(x[i])), length(x))
     T = Core.Compiler.return_type(f, Tuple{ntuple(i->Vector{CT[i]}, length(x))...})
     # workaround for SubArray type
     if T <: SubArray
@@ -420,7 +430,7 @@ function _gather_groups(ds, cols, ::Val{T}; mapformats = false, stable = true, t
     _max_level = nrow(ds)
 
-    if nrow(ds) > 2^23 && !stable && 5 2^23 && !stable && 5_grabrefs(_columns(ds)[colidx[i]]), length(colidx))
-    create_dict_hugeds_multicols(colsvals, rhashes, Val(T))
+    if threads
+        rngs, sz = _gather_groups_hugeds_splitter(rhashes, Val(T))
+        groups = Vector{T}(undef, length(rhashes))
+        ngroups_all = _gather_groups_hugeds_collector(groups, rngs, sz, rhashes, colsvals, Val(T))
+        ngroups = _gather_groups_hugeds_cleanup!(groups, ngroups_all, rngs, sz)
+    else
+        groups = Vector{T}(undef, length(rhashes))
+        rng = 1:length(rhashes)
+        ngroups = create_dict_hugeds_multicols!(groups, rng, colsvals, rhashes, Val(T))
+    end
+    groups, T[], ngroups
+end
+
+# TODO what happens if the values are not randomly grouped based on cols
+function _gather_groups_hugeds_splitter(rhashes, ::Val{T}) where T
+    nt = 997 # TODO this should be an argument; however, we must be careful that this value doesn't degrade the actual dictionary creation in subsequent steps
+    sz = zeros(T, nt)
+    # It is safe to record _ids - memory will be released and it does not add extra memory to the total amount (we later need to allocate groups)
+    _id = Vector{Int16}(undef, length(rhashes))
+    for i in eachindex(rhashes)
+        _id[i] = (rhashes[i] % nt)+1
+        sz[_id[i]] += 1
+    end
+    rngs = Vector{T}(undef, length(rhashes))
+    prepend!(sz, T(0))
+    our_cumsum!(sz)
+    sz_cp = copy(sz)
+
+    for i in eachindex(rhashes)
+        idx=_id[i]
+        sz_cp[idx] += 1
+        rngs[sz_cp[idx]] = i
+    end
+    rngs, sz
+end
+
+function _gather_groups_hugeds_collector(groups, rngs, sz, rhashes, colsvals, ::Val{T}) where T
+    ngroups = Vector{Int}(undef, length(sz)-1)
+    Threads.@threads for i in 2:length(sz)
+        hi = sz[i]
+        lo = sz[i-1]+1
+        _tmp = view(groups, view(rngs, lo:hi))
+        ngroups[i-1] = create_dict_hugeds_multicols!(_tmp, view(rngs, lo:hi), colsvals, rhashes, Val(T))
+    end
+    ngroups
+end
+
+function _gather_groups_hugeds_cleanup!(groups, ngroups, rngs, sz)
+    our_cumsum!(ngroups)
+    Threads.@threads for i in 3:length(sz)
+        hi=sz[i]
+        lo=sz[i-1]+1
+        for j in lo:hi
+            groups[rngs[j]] += ngroups[i-2]
+        end
+    end
+    return ngroups[end]
 end
 
-function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
-    sz = max(1 + ((5 * length(rhashes)) >> 2), 16)
+# groups is a list of integers for which the dict is going to be created
+# get index and set index should sometimes be adjusted based on rng
+# make sure groups is a Vector{T}
+function create_dict_hugeds_multicols!(groups, rng, colvals, rhashes, ::Val{T}) where T
+    isempty(rng) && return 0
+    sz = max(1 + ((5 * length(groups)) >> 2), 16)
     sz = 1 << (8 * sizeof(sz) - leading_zeros(sz - 1))
-    @assert 4 * sz >= 5 * length(rhashes)
+    @assert 4 * sz >= 5 * length(groups)
     szm1 = sz-1
     gslots = zeros(T, sz)
-    groups = Vector{T}(undef, length(rhashes))
     ngroups = 0
-    @inbounds for i in eachindex(rhashes)
+    @inbounds for i in eachindex(rng)
         # find the slot and group index for a row
-        slotix = rhashes[i] & szm1 + 1
+        slotix = rhashes[rng[i]] & szm1 + 1
         gix = -1
         probe = 0
         while true
@@ -570,8 +639,8 @@ function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
                 gslots[slotix] = i
                 gix = ngroups += 1
                 break
-            elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
-                if isequal_row(colvals, i, Int(g_row)) # hit
+            elseif rhashes[rng[i]] == rhashes[rng[g_row]] # occupied slot, check if miss or hit
+                if isequal_row(colvals, Int(rng[i]), Int(rng[g_row])) # hit
                     gix = groups[g_row]
                     break
                 end
@@ -580,9 +649,10 @@ function create_dict_hugeds_multicols(colvals, rhashes, ::Val{T}) where T
             probe += 1
             @assert probe < sz
         end
+        # groups[i] has done its work; we can modify it
         groups[i] = gix
     end
-    return groups, gslots, ngroups
+    return ngroups
 end
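The splitter/collector/cleanup trio above parallelizes group detection for huge data sets: rows are bucketed by `hash % 997`, each bucket builds its own small hash table on its own thread, and the per-bucket group ids are then shifted by a cumulative offset so they become globally unique. A self-contained, single-threaded sketch of the offsetting step (data invented):

```julia
# per-bucket local group ids, computed independently per thread
bucket_groups = [[1, 2, 1], [1, 1, 2]]
ngroups = [2, 2]                 # number of groups found in each bucket
offsets = cumsum([0; ngroups])   # -> [0, 2, 4]

# shift each bucket's ids by the group count of all earlier buckets
global_ids = [g .+ offsets[b] for (b, g) in enumerate(bucket_groups)]
# -> [[1, 2, 1], [3, 3, 4]]; the total group count is offsets[end] == 4
```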
diff --git a/src/precompile/precompile.jl b/src/precompile/precompile.jl
index e1f03ce5..51f62c76 100644
--- a/src/precompile/precompile.jl
+++ b/src/precompile/precompile.jl
@@ -546,6 +546,6 @@ function _precompile()
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:id, :threads), Tuple{Symbol, Bool}},typeof(transpose),Dataset,Vector{Symbol}})
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:threads,), Tuple{Bool}},typeof(transpose),Dataset,UnitRange{Int64}})
     Base.precompile(Tuple{Core.kwftype(typeof(transpose)),NamedTuple{(:threads,), Tuple{Bool}},typeof(transpose),GroupBy,Vector{Int64}})
-
+    VERSION >= v"1.9" && IMD.warmup()
     return nothing
 end
\ No newline at end of file
diff --git a/src/precompile/warmup.jl b/src/precompile/warmup.jl
index 10982d68..99f8e1a6 100644
--- a/src/precompile/warmup.jl
+++ b/src/precompile/warmup.jl
@@ -173,7 +173,8 @@ function warmup()
     findall(duplicates(ds, :a, mapformats = true)) == 2:12
     unique(ds) == ds1
     unique(ds, 2:3) == ds1
-
+    ds = Dataset(x=[rand(10) for _ in 1:100])
+    flatten!(ds, 1)
     t2 = now()
     Dataset(x1 = "Finished warmup in", x2 = t2-t1)
 end
diff --git a/src/sort/gatherby.jl b/src/sort/gatherby.jl
index 46259219..c914b740 100644
--- a/src/sort/gatherby.jl
+++ b/src/sort/gatherby.jl
@@ -115,8 +115,6 @@ function compute_indices(groups, ngroups, ::Val{T}; threads = true) where T
     idx, starts
 end
 
-# fast combine for gatherby data
-
 mutable struct GatherBy
     parent
     groupcols
@@ -125,8 +123,12 @@ mutable struct GatherBy
     mapformats::Bool
     perm
     starts
+    created::DateTime
+end
+function Base.copy(gds::GatherBy)
+    ds_cpy = copy(gds.parent)
+    GatherBy(ds_cpy, copy(gds.groupcols), copy(gds.groups), gds.lastvalid, gds.mapformats, gds.perm === nothing ? nothing : copy(gds.perm), gds.starts === nothing ? nothing : copy(gds.starts), _get_lastmodified(_attributes(ds_cpy)))
 end
-Base.copy(gds::GatherBy) = GatherBy(copy(gds.parent), copy(gds.groupcols), copy(gds.groups), gds.lastvalid, gds.mapformats, gds.perm === nothing ? nothing : copy(gds.perm), gds.starts === nothing ? nothing : copy(gds.starts))
 
 nrow(ds::GatherBy) = nrow(ds.parent)
@@ -148,6 +150,7 @@ Base.summary(gds::GatherBy) =
 
 function Base.show(io::IO, gds::GatherBy;
                    kwargs...)
+    _check_consistency(gds)
     if length(_get_perms(gds)) > 200
         _show(io, view(gds.parent, [first(gds.perm, 100);last(gds.perm, 100)], :); title = summary(gds), show_omitted_cell_summary=false, show_row_number = false, kwargs...)
     else
@@ -176,57 +179,43 @@ end
 function gatherby(ds::AbstractDataset, cols::MultiColumnIndex; mapformats::Bool = true, stable::Bool = true, isgathered::Bool = false, eachrow::Bool = false, threads = true)
     colsidx = index(ds)[cols]
     if isempty(ds)
-        return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing)
+        return GatherBy(ds, colsidx, Int[], 0, mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
     end
     T = nrow(ds) < typemax(Int32) ? Int32 : Int64
     _check_consistency(ds)
     if isgathered
         if eachrow
-            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds))
+            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, 1:nrow(ds), 1:nrow(ds), _get_lastmodified(_attributes(ds)))
         else
             colindex, ranges, last_valid_index = _find_starts_of_groups(ds, colsidx, Val(T); mapformats = mapformats, threads = threads)
             groups = Vector{T}(undef, nrow(ds))
             _group_creator!(groups, ranges, last_valid_index)
-            return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges)
+            return GatherBy(ds, colindex, groups, last_valid_index, mapformats, 1:nrow(ds), ranges, _get_lastmodified(_attributes(ds)))
         end
     else
         if eachrow
             a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
             b = compute_indices(a[1], a[3], nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64); threads = threads)
-            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds))
+            return GatherBy(ds, colsidx, 1:nrow(ds), nrow(ds), mapformats, b[1], 1:nrow(ds), _get_lastmodified(_attributes(ds)))
         else
             a = _gather_groups(ds, colsidx, Val(T), mapformats = mapformats, stable = stable, threads = threads)
-            return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing)
+            return GatherBy(ds, colsidx, a[1], a[3], mapformats, nothing, nothing, _get_lastmodified(_attributes(ds)))
         end
     end
 end
 gatherby(ds::AbstractDataset, col::ColumnIndex; mapformats = true, stable = true, isgathered = false, eachrow = false, threads = true) = gatherby(ds, [col], mapformats = mapformats, stable = stable, isgathered = isgathered, eachrow = eachrow, threads = threads)
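The new `created::DateTime` field plus the `_check_consistency` call in `show` implement the 0.7.16 changelog promise that a stale `GatherBy` errors instead of silently displaying wrong groups. Roughly the intended behavior (a sketch; the exact mutating call and error text are assumptions, not taken from this diff):

```julia
ds = Dataset(x = [1, 1, 2], y = [10, 20, 30])
g  = gatherby(ds, :x)

ds[1, :y] = 99    # modifying the parent bumps its last-modified stamp

show(stdout, g)   # should now throw: the `created` stamp of `g` no longer
                  # matches the parent's last-modified time
```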
Val(Int32) : Val(Int64); mapformats = mapformats, threads = threads) - select!(ds, Not([:___tmp___cols8934, :___tmp___cols8934_2])) - GatherBy(ds, grpcols, nothing, last_valid_index, mapformats, gds.perm, ranges) -end - function _fill_mapreduce_col!(x, f, op, y, loc) @inbounds for i in 1:length(y) x[loc[i]] = op(x[loc[i]], f(y[i])) end end -function _fill_mapreduce_col!(x, f::Vector, op, y, loc) +# only for calculating var - mval is a vector of means +function _fill_mapreduce_col!(x, mval::AbstractVector, op, y, loc) @inbounds for i in 1:length(y) - x[loc[i]] = op(x[loc[i]], f[loc[i]](y[i])) + x[loc[i]] = op(x[loc[i]], _abs2mean(y[i], mval[loc[i]])) end end @@ -242,11 +231,12 @@ function _fill_mapreduce_col_threaded!(x, f, op, y, loc, nt) end end -function _fill_mapreduce_col_threaded!(x, f::Vector, op, y, loc, nt) +# only for calculating var - mval is a vector of means +function _fill_mapreduce_col_threaded!(x, mval::AbstractVector, op, y, loc, nt) @sync for thid in 0:nt-1 Threads.@spawn for i in 1:length(y) @inbounds if loc[i] % nt == thid - x[loc[i]] = op(x[loc[i]], f[loc[i]](y[i])) + x[loc[i]] = op(x[loc[i]], _abs2mean(y[i], mval[loc[i]])) end end end @@ -258,8 +248,8 @@ end function gatherby_mapreduce(gds::GatherBy, f, op, col::ColumnIndex, nt, init, ::Val{T}; promotetypes = false, threads = true) where T CT = T if promotetypes - T <: Base.SmallSigned ? CT = Int : nothing - T <: Base.SmallUnsigned ? CT = UInt : nothing + T <: SMALLSIGNED ? CT = Int : nothing + T <: SMALLUNSIGNED ? CT = UInt : nothing end res = allocatecol(Union{CT, Missing}, gds.lastvalid) fill!(res, init) @@ -271,8 +261,8 @@ function gatherby_mapreduce(gds::GatherBy, f, op, col::ColumnIndex, nt, init, :: res end -_gatherby_maximum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_max_fun, col, nt, missing, Val(nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) -_gatherby_minimum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_min_fun, col, nt, missing, Val(nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) +_gatherby_maximum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_max_fun, col, nt, missing, Val(our_nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) +_gatherby_minimum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_min_fun, col, nt, missing, Val(our_nonmissingtype(eltype(gds.parent[!, col]))), threads = threads) _gatherby_sum(gds, col; f = identity, nt = Threads.nthreads(), threads = true) = gatherby_mapreduce(gds, f, _stat_add_sum, col, nt, missing, Val(typeof(zero(Core.Compiler.return_type(f, Tuple{eltype(gds.parent[!, col])})))), promotetypes = true, threads = threads) _gatherby_n(gds, col; nt = Threads.nthreads(), threads = true) = _gatherby_sum(gds, col, f = _stat_notmissing, nt = nt, threads = threads) _gatherby_length(gds, col; nt = Threads.nthreads(), threads = true) = _gatherby_sum(gds, col, f = x->1, nt = nt, threads = threads) @@ -306,7 +296,7 @@ function _gatherby_mean(gds, col; nt = Threads.nthreads(), threads = true) nval = t2 end - T = Core.Compiler.return_type(/, Tuple{nonmissingtype(eltype(sval)), nonmissingtype(eltype(nval))}) + T = Core.Compiler.return_type(/, Tuple{our_nonmissingtype(eltype(sval)), our_nonmissingtype(eltype(nval))}) res = _our_vect_alloc(Union{Missing, T}, length(nval)) _fill_gatherby_mean_barrier!(res, sval, nval) res @@ -340,6 
+330,7 @@ function _fill_gatherby_var_barrier!(res, countnan, meanval, ss, nval, cal_std, end # TODO directly calculating var should be a better approach +_abs2mean(x, meanval) = abs2(x - meanval) function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) if threads nt = Threads.nthreads() @@ -347,7 +338,7 @@ function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) t1 = Threads.@spawn _gatherby_cntnan(gds, col, nt = nt2) t2 = Threads.@spawn _gatherby_mean(gds, col, nt = nt2) meanval = fetch(t2) - t3 = Threads.@spawn gatherby_mapreduce(gds, [x->abs2(x - meanval[i]) for i in 1:length(meanval)], _stat_add_sum, col, nt2, missing, Val(Float64)) + t3 = Threads.@spawn gatherby_mapreduce(gds, meanval, _stat_add_sum, col, nt2, missing, Val(Float64)) t4 = Threads.@spawn _gatherby_n(gds, col, nt = nt2) countnan = fetch(t1) ss = fetch(t3) @@ -356,13 +347,13 @@ function _gatherby_var(gds, col; dof = true, cal_std = false, threads = true) t1 = _gatherby_cntnan(gds, col, threads = threads) t2 = _gatherby_mean(gds, col, threads = threads) meanval = t2 - t3 = gatherby_mapreduce(gds, [x->abs2(x - meanval[i]) for i in 1:length(meanval)], _stat_add_sum, col, Threads.nthreads(), missing, Val(Float64), threads = threads) + t3 = gatherby_mapreduce(gds, meanval, _stat_add_sum, col, Threads.nthreads(), missing, Val(Float64), threads = threads) t4 = _gatherby_n(gds, col, threads = threads) countnan = t1 ss = t3 nval = t4 end - T = Core.Compiler.return_type(/, Tuple{nonmissingtype(eltype(meanval)), nonmissingtype(eltype(nval))}) + T = Core.Compiler.return_type(/, Tuple{our_nonmissingtype(eltype(meanval)), our_nonmissingtype(eltype(nval))}) res = _our_vect_alloc(Union{Missing, T}, length(nval)) _fill_gatherby_var_barrier!(res, countnan, meanval, ss, nval, cal_std, dof) res end @@ -375,7 +366,7 @@ const FAST_GATHERBY_REDUCTION = [sum, length, minimum, maximum, mean, var, std, function _fast_gatherby_reduction(gds, ms) !(gds isa GatherBy) && return false - gds.groups == nothing && return false + gds.groups === nothing && return false for i in 1:length(ms) if (ms[i].second.first isa Expr) && ms[i].second.first.head == :BYROW elseif (ms[i].second.first isa Base.Callable) diff --git a/src/sort/groupby.jl b/src/sort/groupby.jl index 1d80dc59..5ee85abb 100644 --- a/src/sort/groupby.jl +++ b/src/sort/groupby.jl @@ -12,7 +12,7 @@ Return a `GroupBy` representing a view of a `sorted` data set which each group o - `ds` : an `AbstractDataset` or the output of `groupby`. - `cols` : data set columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). -- `alg` : The sorting algorithm for creating `grouped` data set. By default Heap algorithm is used, however, user can pass `Quicksort` too. +- `alg` : The sorting algorithm for creating the `grouped` data set. By default the Heap algorithm is used; however, the user can pass `QuickSort` too. - `rev` : A `Bool` value or a Vector of `Bool` which indicates which columns should be sorted in descending order. - `mapformats`: Whether the formatted values should be used or not. - `stable`: Whether the sorting algorithm should be stable or not. Setting this to `false` often improves the performance. @@ -97,7 +97,7 @@ Replace a data set by its sorted version and tag the data set as a grouped data s - `ds` : a `Dataset`. - `cols` : data set columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). -- `alg` : The sorting algorithm for creating `grouped` data set. 
By default Heap algorithm is used, however, user can pass `Quicksort` too. +- `alg` : The sorting algorithm for creating the `grouped` data set. By default the Heap algorithm is used; however, the user can pass `QuickSort` too. - `rev` : A `Bool` value or a Vector of `Bool` which indicates which columns should be sorted in descending order. - `mapformats`: Whether the formatted values should be used or not. - `stable`: Whether the sorting algorithm should be stable or not. Setting this to `false` often improves the performance. @@ -167,9 +167,13 @@ mutable struct GroupBy parent groupcols rev perm starts lastvalid mapformats::Bool + created::DateTime end -Base.copy(gds::GroupBy) = GroupBy(copy(gds.parent), copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats) +function Base.copy(gds::GroupBy) + ds_cp = copy(gds.parent) + GroupBy(ds_cp, copy(gds.groupcols), copy(gds.rev), copy(gds.perm), copy(gds.starts), gds.lastvalid, gds.mapformats, _get_lastmodified(_attributes(ds_cp))) +end nrow(ds::GroupBy) = nrow(ds.parent) ncol(ds::GroupBy) = ncol(ds.parent) @@ -185,10 +189,10 @@ function groupby(ds::Dataset, cols::MultiColumnIndex; alg = HeapSortAlg(), rev = _check_consistency(ds) colsidx = index(ds)[cols] if isempty(ds) - return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats) + return GroupBy(parent(ds), colsidx, rev, Int[], Int[], 0, mapformats, _get_lastmodified(_attributes(ds))) end a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads) - GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds))) end groupby(ds::Dataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) @@ -209,7 +213,7 @@ function groupby(ds::GroupBy, cols::MultiColumnIndex; alg = HeapSortAlg(), rev = colsidx = index(ds)[cols] grng = GIVENRANGE(copy(_get_perms(ds)),copy(_group_starts(ds)), nothing, _ngroups(ds)) a = _sortperm(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, givenrange = grng, skipcol = -1, threads = threads) - GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(parent(ds),colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(parent(ds)))) end groupby(ds::GroupBy, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) @@ -233,6 +237,7 @@ end modify(original_gds::Union{GroupBy, GatherBy}, @nospecialize(args...); threads::Bool = true) = modify!(copy(original_gds), args..., threads = threads) function modify!(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); threads::Bool = true) + _check_consistency(gds) if parent(gds) isa SubDataset idx_cpy = copy(index(parent(gds))) else @@ -315,6 +320,7 @@ end function combine(gds::Union{GroupBy, GatherBy}, @nospecialize(args...); dropgroupcols = false, threads = true) + _check_consistency(gds) idx_cpy::Index = Index(Dict{Symbol, Int}(), Symbol[], Dict{Int, Function}()) if !dropgroupcols for i in gds.groupcols @@ -441,6 +447,7 @@ Base.summary(gds::GroupBy) = function Base.show(io::IO, gds::GroupBy; kwargs...) 
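# ----------------------------------------------------------------------------
# Editor's sketch (not part of this patch): the intended behaviour of the new
# `created` timestamp, assuming -- as these hunks do -- that mutating a Dataset
# updates the lastmodified attribute read by `_get_lastmodified`.
# Hypothetical session:
#   ds  = Dataset(x = [1, 2, 1], y = [10, 20, 30])
#   gds = groupby(ds, :x)     # gds.created snapshots the parent's lastmodified
#   ds[1, :y] = 99            # parent modified after grouping
#   combine(gds, :y => sum)   # fails the @assert in _check_consistency
#   gds = groupby(ds, :x)     # regrouping refreshes the snapshot
# ----------------------------------------------------------------------------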
+ _check_consistency(gds) #TODO pretty_table is very slow for large views, temporary workaround, later we should fix this if length(gds.perm) > 200 _show(io, view(gds.parent, [first(gds.perm, 100);last(gds.perm, 100)], :); title = summary(gds), show_omitted_cell_summary=false, show_row_number = false, kwargs...) @@ -589,6 +596,15 @@ function groupby(ds::SubDataset, cols::MultiColumnIndex; alg = HeapSortAlg(), re _check_consistency(ds) colsidx = index(ds)[cols] a = _sortperm_v(ds, cols, rev, a = alg, mapformats = mapformats, stable = stable, threads = threads) - GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats) + GroupBy(ds, colsidx, rev, a[2], a[1], a[3], mapformats, _get_lastmodified(_attributes(ds))) end groupby(ds::SubDataset, col::ColumnIndex; alg = HeapSortAlg(), rev = false, mapformats::Bool = true, stable = true, threads = true) = groupby(ds, [col], alg = alg, rev = rev, mapformats = mapformats, stable = stable, threads = threads) + + +### check consistency of grouped data - GroupBy, GatherBy + +function _check_consistency(ds::Union{GroupBy, GatherBy}) + lmd = ds.created + lmp = _get_lastmodified(_attributes(parent(ds))) + @assert lmd == lmp "The parent data set that this grouped data set is based on has been modified. Regroup the data to fix the issue." +end \ No newline at end of file diff --git a/src/sort/int.jl b/src/sort/int.jl index d8617f76..681ca1b6 100644 --- a/src/sort/int.jl +++ b/src/sort/int.jl @@ -98,7 +98,7 @@ end function _sort_chunks_int_right!(x, idx::Vector{<:Integer}, idx_cpy, where, number_of_chunks, rangelen, minval, o::Ordering) cz = div(length(x), number_of_chunks) en = length(x) - Threads.@threads for i in 1:number_of_chunks + Threads.@threads :static for i in 1:number_of_chunks ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], (i-1)*cz+1,i*cz, rangelen, minval) end # take care of the last few observations @@ -111,7 +111,7 @@ end function _sort_chunks_int_left!(x, idx::Vector{<:Integer}, idx_cpy, where, number_of_chunks, rangelen, minval, o::Ordering) cz = div(length(x), number_of_chunks) en = length(x) - Threads.@threads for i in 1:number_of_chunks + Threads.@threads :static for i in 1:number_of_chunks ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], (i-1)*cz+1,i*cz, rangelen, minval) end # take care of the last few observations @@ -262,7 +262,7 @@ function _ds_sort_int_missatright_nopermx_threaded!(x, original_P, copy_P, lo, h where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = lo:hi + Threads.@threads :static for i = lo:hi @inbounds ismissing(x[i]) ? where[Threads.threadid()][rangelen+3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 2] += 1 end for j in 3:length(where[1]) @@ -306,7 +306,7 @@ function _ds_sort_int_missatright_nopermx_threaded!(x, original_P, rangelen, min where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = 1:length(x) + Threads.@threads :static for i = 1:length(x) @inbounds ismissing(x[i]) ? where[Threads.threadid()][rangelen+3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 2] += 1 end for j in 3:length(where[1]) @@ -348,7 +348,7 @@ function _ds_sort_int_missatleft_nopermx_threaded!(x, original_P, copy_P, lo, hi where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = lo:hi + Threads.@threads :static for i = lo:hi @inbounds ismissing(x[i]) ? 
where[Threads.threadid()][3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 3] += 1 end for j in 3:length(where[1]) @@ -392,7 +392,7 @@ function _ds_sort_int_missatleft_nopermx_threaded!(x, original_P, rangelen, minv where[i][1] = 1 where[i][2] = 1 end - Threads.@threads for i = 1:length(x) + Threads.@threads :static for i = 1:length(x) @inbounds ismissing(x[i]) ? where[Threads.threadid()][3] += 1 : where[Threads.threadid()][Int(x[i]) + offs + 3] += 1 end for j in 3:length(where[1]) diff --git a/src/sort/sort.jl b/src/sort/sort.jl index 82f14bd3..7e67766c 100644 --- a/src/sort/sort.jl +++ b/src/sort/sort.jl @@ -213,11 +213,21 @@ end function _issorted_check_for_each_range(v, starts, lastvalid, _ord, nrows; threads = true) part_res = ones(Bool, threads ? Threads.nthreads() : 1) - @_threadsfor threads for rng in 1:lastvalid - lo = starts[rng] - rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 - part_res[Threads.threadid()] = _issorted_barrier(v, _ord, lo, hi) - !part_res[Threads.threadid()] && break + if threads + + Threads.@threads :static for rng in 1:lastvalid + lo = starts[rng] + rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 + part_res[Threads.threadid()] = _issorted_barrier(v, _ord, lo, hi) + !part_res[Threads.threadid()] && break + end + else + for rng in 1:lastvalid + lo = starts[rng] + rng == lastvalid ? hi = nrows : hi = starts[rng+1] - 1 + part_res[1] = _issorted_barrier(v, _ord, lo, hi) # part_res has length 1 in the sequential path + !part_res[1] && break + end end all(part_res) end diff --git a/src/sort/sortperm.jl b/src/sort/sortperm.jl index 9fc77631..cd6e2d84 100644 --- a/src/sort/sortperm.jl +++ b/src/sort/sortperm.jl @@ -29,7 +29,7 @@ end # we should find starts here function fast_sortperm_int_threaded!(x, original_P, copy_P, ranges, rangelen, minval, misatleft, last_valid_range, ::Val{T}) where T starts = [T[] for i in 1:Threads.nthreads()] - Threads.@threads for i in 1:last_valid_range + Threads.@threads :static for i in 1:last_valid_range rangestart = ranges[i] i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 # if (rangeend - rangestart) == 0 @@ -45,6 +45,8 @@ function fast_sortperm_int_threaded!(x, original_P, copy_P, ranges, rangelen, mi end cnt = 1 flag = false + # Threads.@threads no longer keeps the order of the runs; sort `starts` so the ranges are shaped in order + sort!(starts, by=x->isempty(x) ? missing : x[1]) @inbounds for i in 1:length(starts) for j in 1:length(starts[i]) ranges[cnt] = starts[i][j] @@ -103,29 +105,57 @@ function fast_sortperm_int!(x, original_P, copy_P, ranges, rangelen, minval, mis end function _sortperm_int!(idx, idx_cpy, x, ranges, where, last_valid_range, missingatleft, ord, a; threads = true) - @_threadsfor threads for i in 1:last_valid_range - rangestart = ranges[i] - i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 - if (rangeend - rangestart + 1) == 1 - continue - end - _minval = stat_minimum(x, lo = rangestart, hi = rangeend) - if ismissing(_minval) - continue - else - minval::Int = _minval + if threads + Threads.@threads :static for i in 1:last_valid_range + rangestart = ranges[i] + i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 + if (rangeend - rangestart + 1) == 1 + continue + end + _minval = stat_minimum(x, lo = rangestart, hi = rangeend) + if ismissing(_minval) + continue + else + minval::Int = _minval + end + maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) + # the overflow is checked before calling _sortperm_int! 
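# ----------------------------------------------------------------------------
# Editor's note (illustrative, not part of this patch): the switch to
# `Threads.@threads :static` in these hunks matters because the loop bodies
# index per-thread scratch buffers via `Threads.threadid()`. Since Julia 1.8
# the default schedule is dynamic and a task may migrate between threads, so
# `threadid()` is only stable under `:static`. A minimal, self-contained demo
# of the pattern being protected:
using Base.Threads
function per_thread_histogram(x, k)
    buf = [zeros(Int, k) for _ in 1:nthreads()]  # one scratch buffer per thread
    @threads :static for i in eachindex(x)
        buf[threadid()][x[i]] += 1  # safe: each iteration is pinned to one thread
    end
    reduce(+, buf)
end
per_thread_histogram(rand(1:8, 10_000), 8)
# ----------------------------------------------------------------------------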
+ rangelen = maxval - minval + 1 + if rangelen < div(rangeend - rangestart + 1, 2) + if missingatleft + ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + end + else + ds_sort!(x, idx, rangestart, rangeend, a, ord) + end end - maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) - # the overflow is check before calling _sortperm_int! - rangelen = maxval - minval + 1 - if rangelen < div(rangeend - rangestart + 1, 2) - if missingatleft - ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + for i in 1:last_valid_range + rangestart = ranges[i] + i == last_valid_range ? rangeend = length(x) : rangeend = ranges[i+1] - 1 + if (rangeend - rangestart + 1) == 1 + continue + end + _minval = stat_minimum(x, lo = rangestart, hi = rangeend) + if ismissing(_minval) + continue else - ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + minval::Int = _minval + end + maxval::Int = stat_maximum(x, lo = rangestart, hi = rangeend) + # the overflow is checked before calling _sortperm_int! + rangelen = maxval - minval + 1 + if rangelen < div(rangeend - rangestart + 1, 2) + if missingatleft + ds_sort_int_missatleft!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + else + ds_sort_int_missatright!(x, idx, idx_cpy, where[Threads.threadid()], rangestart, rangeend, rangelen, minval) + end + else + ds_sort!(x, idx, rangestart, rangeend, a, ord) end - else - ds_sort!(x, idx, rangestart, rangeend, a, ord) end end end @@ -133,7 +163,7 @@ end function _apply_by_f_barrier(x::AbstractVector{T}, by, rev, threads) where T needrev = rev missat = :right - CT = Core.Compiler.return_type(_date_value∘by, Tuple{nonmissingtype(T)}) + CT = Core.Compiler.return_type(_date_value∘by, Tuple{our_nonmissingtype(T)}) if CT == Bool CT = Int8 end @@ -141,7 +171,7 @@ function _apply_by_f_barrier(x::AbstractVector{T}, by, rev, threads) where T # _temp = Vector{CT}(undef, length(x)) _temp = _our_vect_alloc(CT, length(x)) # we should make sure changing sign doesn't overflow - if rev && nonmissingtype(CT) <: Union{Bool, Int8, Int16, Int32, Int64} && isless(typemin(nonmissingtype(CT)), threads ? hp_minimum(_date_value∘by, x) : stat_minimum(_date_value∘by, x)) + if rev && our_nonmissingtype(CT) <: Union{Bool, Int8, Int16, Int32, Int64} && isless(typemin(our_nonmissingtype(CT)), threads ? 
hp_minimum(_date_value∘by, x) : stat_minimum(_date_value∘by, x)) _by = x-> -_date_value(by(x)) needrev = false missat = :left diff --git a/src/stat/hp_stat.jl b/src/stat/hp_stat.jl index 5a5766da..ed5d6aa3 100644 --- a/src/stat/hp_stat.jl +++ b/src/stat/hp_stat.jl @@ -3,7 +3,7 @@ function hp_maximum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_maximum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) if T >: Missing CT = Union{Missing,CT} end @@ -22,7 +22,7 @@ function hp_minimum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_minimum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) if T >: Missing CT = Union{Missing,CT} end @@ -41,9 +41,9 @@ function hp_sum(f, x::AbstractVector{T}) where {T} nt = Threads.nthreads() cz = div(n, nt) cz == 0 && return stat_sum(f, x) - CT = Core.Compiler.return_type(f, Tuple{nonmissingtype(eltype(x))}) - CT <: Base.SmallSigned ? CT = Int : nothing - CT <: Base.SmallUnsigned ? CT = UInt : nothing + CT = Core.Compiler.return_type(f, Tuple{our_nonmissingtype(eltype(x))}) + CT <: SMALLSIGNED ? CT = Int : nothing + CT <: SMALLUNSIGNED ? CT = UInt : nothing CT <: Bool ? CT = Int : nothing if T >: Missing CT = Union{Missing,CT} diff --git a/src/stat/non_hp_stat.jl b/src/stat/non_hp_stat.jl index 8ad5c8f9..d44e3d71 100644 --- a/src/stat/non_hp_stat.jl +++ b/src/stat/non_hp_stat.jl @@ -119,7 +119,7 @@ function rescale(x, minx, maxx, minval, maxval) -(-maxx * minval + minx * maxval) / (maxx - minx) + (-minval + maxval) * x / (maxx - minx) end rescale(::Missing, minx, maxx, minval, maxval) = missing -rescale(x::Vector, minx, maxx, minval, maxval) = rescale.(x, minx, maxx, minval, maxval) +rescale(x::AbstractVector, minx, maxx, minval, maxval) = rescale.(x, minx, maxx, minval, maxval) rescale(x, minx, maxx) = rescale(x, minx, maxx, 0.0, 1.0) """ @@ -137,11 +137,10 @@ end # this is manual simd version for max(min) function function stat_maximum(f::typeof(identity), x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} all(ismissing, view(x, lo:hi)) && return missing - _dmiss(x) = ismissing(x) ? typemin(nonmissingtype(T)) : x + _dmiss(x) = ismissing(x) ? typemin(our_nonmissingtype(T)) : x Base.mapreduce_impl(_dmiss, max, x, lo, hi) end function stat_maximum(f::F, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {F,T} - all(ismissing, view(x, lo:hi)) && return missing Base.mapreduce_impl(f, _stat_max_fun, x, lo, hi) end stat_maximum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} = stat_maximum(identity, x; lo=lo, hi=hi) @@ -162,11 +161,10 @@ stat_findmax(x::AbstractArray{T,1}) where {T} = stat_findmax(identity, x) function stat_minimum(f::typeof(identity), x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} all(ismissing, view(x, lo:hi)) && return missing - @inline _dmiss(x) = ismissing(x) ? typemax(nonmissingtype(T)) : x + @inline _dmiss(x) = ismissing(x) ? 
typemax(our_nonmissingtype(T)) : x Base.mapreduce_impl(_dmiss, min, x, lo, hi) end function stat_minimum(f::F, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {F,T} - all(ismissing, view(x, lo:hi)) && return missing Base.mapreduce_impl(f, _stat_min_fun, x, lo, hi) end stat_minimum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T} = stat_minimum(identity, x; lo=lo, hi=hi) @@ -180,9 +178,7 @@ stat_findmin(x::AbstractArray{T,1}) where {T} = stat_findmin(identity, x) function stat_sum(f, x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T<:Union{Missing,INTEGERS,FLOATS}} - all(ismissing, view(x, lo:hi)) && return f(first(x)) - _dmiss(y) = ifelse(ismissing(f(y)), zero(T), f(y)) - Base.mapreduce_impl(_dmiss, _stat_add_sum, x, lo, hi) + Base.mapreduce_impl(f, _stat_add_sum, x, lo, hi) end stat_sum(x::AbstractArray{T,1}; lo=1, hi=length(x)) where {T<:Union{Missing,INTEGERS,FLOATS}} = stat_sum(identity, x; lo=lo, hi=hi) @@ -297,19 +293,21 @@ function stat_wmean(f, x::AbstractVector{T}, w::AbstractArray{S,1}) where {T} wh end stat_wmean(x::AbstractVector{T}, w::AbstractArray{S,1}) where {T} where {S} = stat_wmean(identity, x, w) - +_abs2_var_barrier(x,y,f::F) where F = abs2(f(x)-y) +_meanval_var_barrier(n, sval)::Union{Missing, Float64} = n == 0 ? missing : sval / n function stat_var(f, x::AbstractArray{T,1}, dof=true)::Union{Float64,Missing} where {T<:Union{Missing,INTEGERS,FLOATS}} - all(ismissing, x) && return missing + # all(ismissing, x) && return missing # any(ISNAN, x) && return convert(eltype(x), NaN) # meanval = stat_mean(f, x) # n = mapreduce(!ismissing∘f, +, x) sval = stat_sum(y -> f(y) * 1.0, x) n = mapreduce(!ismissing ∘ f, +, x) - meanval = n == 0 ? missing : sval / n + meanval = _meanval_var_barrier(n, sval) ss = 0.0 for i in 1:length(x) - ss = _stat_add_sum(ss, abs2(f(x[i]) - meanval)) + # ss = _stat_add_sum(ss, abs2(f(x[i]) - meanval)) + ss = _stat_add_sum(ss, _abs2_var_barrier(x[i], meanval, f)) end if n == 0 @@ -331,7 +329,7 @@ stat_std(x::AbstractArray{T,1}, dof=true) where {T} = stat_std(identity, x, dof) function stat_median(v::AbstractArray{T,1}) where {T} isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) all(ismissing, v) && return missing - (nonmissingtype(eltype(v)) <: AbstractFloat || nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) + (our_nonmissingtype(eltype(v)) <: AbstractFloat || our_nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) nmis::Int = mapreduce(ismissing, +, v) n = length(v) - nmis mid = div(1 + n, 2) @@ -343,10 +341,11 @@ function stat_median(v::AbstractArray{T,1}) where {T} end end +# TODO in julia1.9+ partialsort! allocates, and it is not a good idea if we need to call stat_median! 
many times function stat_median!(v::AbstractArray{T,1}) where {T} isempty(v) && throw(ArgumentError("median of an empty array is undefined, $(repr(v))")) all(ismissing, v) && return missing - (nonmissingtype(eltype(v)) <: AbstractFloat || nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) + (our_nonmissingtype(eltype(v)) <: AbstractFloat || our_nonmissingtype(eltype(v)) >: AbstractFloat) && any(ISNAN, v) && return convert(eltype(v), NaN) nmis::Int = mapreduce(ismissing, +, v) n = length(v) - nmis mid = div(1 + n, 2) diff --git a/src/stat/stat.jl b/src/stat/stat.jl index fe7d64be..d3d5ecbf 100644 --- a/src/stat/stat.jl +++ b/src/stat/stat.jl @@ -8,10 +8,16 @@ minimum(f, x; threads = false) = Base.minimum(f, x) minimum(x::AbstractArray{Union{Missing, T},1}; threads = false) where T <: Union{INTEGERS, FLOATS, TimeType}= isempty(x) ? throw(ArgumentError("empty arrays are not allowed")) : threads ? hp_minimum(identity, x) : stat_minimum(identity, x) minimum(x; threads = false) = Base.minimum(x) # TODO not optimised for simd - threads option is useless here / it is here because we have it for other types of data -maximum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = mapreduce(f, _stat_max_fun, x) -minimum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = mapreduce(f, _stat_min_fun, x) -maximum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = maximum(identity, x) -minimum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: AbstractString = minimum(identity, x) +# using Union{Missing, AbstractString} forces Vector{Missing} to fall back to these definitions on Julia >= 1.9 +if VERSION >= v"1.9" + _TASM_14329 = Union{Missing, AbstractString} +else + _TASM_14329 = AbstractString +end +maximum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = mapreduce(f, _stat_max_fun, x) +minimum(f, x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = mapreduce(f, _stat_min_fun, x) +maximum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = maximum(identity, x) +minimum(x::AbstractVector{Union{Missing, T}}; threads = false) where T <: _TASM_14329 = minimum(identity, x) sum(f, x::AbstractArray{Union{Missing, T},1}; threads = false) where T <: Union{INTEGERS, FLOATS} = isempty(x) ? throw(ArgumentError("empty arrays are not allowed")) : threads ? hp_sum(f, x) : stat_sum(f, x) sum(f, x; threads = false)=Base.sum(f, x) diff --git a/src/subdataset/subdataset.jl b/src/subdataset/subdataset.jl index cc1f5091..19acb161 100644 --- a/src/subdataset/subdataset.jl +++ b/src/subdataset/subdataset.jl @@ -1,5 +1,5 @@ """ - SubDataset{<:Dataset, <:AbstractIndex, <:AbstractVector{Int}} <: Dataset + SubDataset{<:Dataset, <:AbstractIndex, <:AbstractVector{Int}, DateTime} <: Dataset A view of a `Dataset`. It is returned by a call to the `view` function on a `Dataset` if a collection of rows and columns is specified. 
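A minimal dispatch sketch (editor's illustration with a hypothetical function `f`, not part of this patch) of the `_TASM_14329` trick above: with the bound widened to `Union{Missing, AbstractString}`, `T = Missing` satisfies the `where` constraint, so a `Vector{Missing}` can reach the string methods; pre-1.9 dispatch behaved differently, which is what the `VERSION` gate works around.

f(x::AbstractVector{Union{Missing, T}}) where T <: Union{Missing, AbstractString} = :string_path
f(x::AbstractVector) = :generic_path

f([missing, "a"])               # => :string_path (T = String)
f(Vector{Missing}(missing, 2))  # => :string_path (T = Missing satisfies the widened bound)
f([1, 2])                       # => :generic_path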
@@ -90,6 +90,7 @@ struct SubDataset{D<:AbstractDataset, S<:AbstractIndex, T<:AbstractVector{Int}} parent::D colindex::S rows::T # maps from subds row indexes to parent row indexes + created::DateTime end _attributes(sds::SubDataset) = getfield(parent(sds), :attributes) @@ -111,11 +112,25 @@ Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVect sindex = SubIndex(index(parent), cols) # SubDataset without columns should not have any row if all(==(0), sindex.remap) - SubDataset(parent, sindex, Int[]) + SubDataset(parent, sindex, Int[], _get_lastmodified(_attributes(parent))) else - SubDataset(parent,sindex , rows) + SubDataset(parent,sindex , rows, _get_lastmodified(_attributes(parent))) end end + +Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVector{Int}, cols, created) + @boundscheck if !checkindex(Bool, axes(parent, 1), rows) + throw(BoundsError(parent, (rows, cols))) + end + sindex = SubIndex(index(parent), cols) + # SubDataset without columns should not have any row + if all(==(0), sindex.remap) + SubDataset(parent, sindex, Int[], created) + else + SubDataset(parent,sindex , rows, created) + end +end + Base.@propagate_inbounds SubDataset(parent::Dataset, ::Colon, cols) = SubDataset(parent, axes(parent, 1), cols) @inline SubDataset(parent::Dataset, row::Integer, cols) = @@ -144,7 +159,7 @@ Base.@propagate_inbounds function SubDataset(parent::Dataset, rows::AbstractVect end Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind, cols) = - SubDataset(parent(sds), rows(sds)[rowind], parentcols(index(sds), cols)) + SubDataset(parent(sds), rows(sds)[rowind], parentcols(index(sds), cols), getfield(sds, :created)) Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, cols) = throw(ArgumentError("invalid row index of type Bool")) @@ -158,7 +173,7 @@ Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, cols) = Base.@propagate_inbounds SubDataset(sds::SubDataset, rowind::Bool, ::Colon) = throw(ArgumentError("invalid row index of type Bool")) Base.@propagate_inbounds SubDataset(sds::SubDataset, ::Colon, cols) = - SubDataset(parent(sds), rows(sds), parentcols(index(sds), cols)) + SubDataset(parent(sds), rows(sds), parentcols(index(sds), cols), getfield(sds, :created)) @inline SubDataset(sds::SubDataset, ::Colon, ::Colon) = sds # just for showing SubDataset @@ -202,15 +217,15 @@ Base.@propagate_inbounds Base.view(ads::AbstractDataset, ::typeof(!), colind::Co @inline Base.view(ads::AbstractDataset, rowinds, colind::Bool) = throw(ArgumentError("invalid column index $colind of type `Bool`")) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds, +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds, colinds::MultiColumnIndex) = - SubDataset(ads, rowinds, colinds) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds::typeof(!), + SubDataset(parent, rowinds, colinds) +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds::typeof(!), colinds::MultiColumnIndex) = - SubDataset(ads, :, colinds) -Base.@propagate_inbounds Base.view(ads::AbstractDataset, rowinds::Not, + SubDataset(parent, :, colinds) +Base.@propagate_inbounds Base.view(parent::AbstractDataset, rowinds::Not, colinds::MultiColumnIndex) = - SubDataset(ads, axes(ads, 1)[rowinds], colinds) + SubDataset(parent, axes(parent, 1)[rowinds], colinds) ############################################################################## ## diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 461c725e..b4a63005 100644 
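The two `SubDataset` constructor paths above encode a simple rule: a fresh view snapshots the parent's current lastmodified, while a view of a view reuses the outer view's `created` (the `getfield(sds, :created)` calls), so both go stale together once the parent mutates. A toy model of that rule (editor's illustration; `Parent` and `View` are made-up stand-ins, with an `Int` counter in place of `DateTime`):

mutable struct Parent
    lastmodified::Int
end
struct View
    parent::Parent
    created::Int
end
View(p::Parent) = View(p, p.lastmodified)      # fresh view: snapshot now
subview(v::View) = View(v.parent, v.created)   # view of a view: reuse the snapshot
isstale(v::View) = v.created != v.parent.lastmodified

p = Parent(1); v = View(p); sv = subview(v)
p.lastmodified += 1                            # mutate the parent
isstale(v), isstale(sv)                        # (true, true): both go stale together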
--- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -133,7 +133,7 @@ end end ds4 = (x -> ds[1, 1]).(ds) @test names(ds4) == names(ds) - @test all(isa.(eachcol(ds4), Ref(CategoricalArray))) + @test all(isa.(eachcol(ds4), DatasetColumn{Dataset, CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}})) @test all(eachcol(ds4) .== Ref(categorical(["a", "a"]))) ds5 = Dataset(x=Any[1, 2, 3], y=Any[1, 2.0, big(3)]) diff --git a/test/byrow.jl b/test/byrow.jl index 379a3070..ef4fc19a 100644 --- a/test/byrow.jl +++ b/test/byrow.jl @@ -414,4 +414,12 @@ end @test byrow(ds, fun123, (1,2,3)) == [1,-1.0,-9,2.5] fun123_2(x,y) = x == 1 && y < 0 ? true : false @test byrow(ds, fun123_2, (:x1, :x2)) == [false, false, true, false] +end + +@testset "byrow - nunique" begin + ds = Dataset(x=2.1, y=4611911198408756429, z=missing, k=-2.1) + @test byrow(ds, nunique, :)[1] == 4 + @test byrow(ds, nunique, :, count_missing = false)[1] == 3 + @test byrow(ds, nunique, :, by = abs)[1] == 3 + @test byrow(ds, nunique, :, by = abs, count_missing=false)[1] == 2 end \ No newline at end of file diff --git a/test/constructors.jl b/test/constructors.jl index 231a036c..a5d60f74 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -318,13 +318,13 @@ end @testset "column types" begin ds = Dataset(A = 1:3, B = 2:4, C = 3:5) - answer = [Array{Union{Missing, Int}, 1}, Array{Union{Missing, Int}, 1}, Array{Union{Missing, Int}, 1}] + answer = [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}] @test typeof.(eachcol(ds)) == answer ds[!, :D] = [4, 5, missing] - push!(answer, Vector{Union{Int, Missing}}) + push!(answer, DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}) @test typeof.(eachcol(ds)) == answer ds[!, :E] .= 'c' - push!(answer, Vector{Union{Missing, Char}}) + push!(answer, DatasetColumn{Dataset, Vector{Union{Missing, Char}}}) @test typeof.(eachcol(ds)) == answer end diff --git a/test/join.jl b/test/join.jl index 443c3879..f99aff6b 100644 --- a/test/join.jl +++ b/test/join.jl @@ -93,18 +93,25 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, method = :hash) @test innerjoin(name, job, on = :ID) == inner == innerjoin(name, job, on = :ID, threads = false) + @test innerjoin(name, sort(job, :ID), on = :ID) == inner == innerjoin(name, sort(job, :ID), on = :ID, threads = false) == innerjoin(name, sort(job, :ID, rev=true), on = :ID) @test innerjoin(name, job, on = :ID) == inner == innerjoin(name, job, on = :ID, method = :hash, threads = false) @test outerjoin(name, job, on = :ID) == outer == outerjoin(name, job, on = :ID, threads = false) + @test outerjoin(name, sort(job, :ID), on = :ID) == outer == outerjoin(name, sort(job, :ID), on = :ID, threads = false) == outerjoin(name, sort(job, :ID, rev=true), on = :ID) @test outerjoin(name, job, on = :ID) == outer == outerjoin(name, job, on = :ID, method = :hash, threads = false) @test leftjoin(name, job, on = :ID) == left == leftjoin(name, job, on = :ID, threads = false) + @test leftjoin(name, sort(job, :ID), on = :ID) == left == leftjoin(name, sort(job, :ID), on = :ID, threads = false) == leftjoin(name, sort(job, :ID, rev=true), on = :ID) @test leftjoin(name, job, on = :ID) == left == leftjoin(name, job, on = :ID, method = :hash, threads = false) - 
@test semijoin(name, job, on = :ID) == semi == semijoin(name, job, on = :ID, threads = false) + @test semijoin(name, job, on = :ID, method=:sort) == semi == semijoin(name, job, on = :ID, threads = false, method=:sort) + @test semijoin(name, sort(job, :ID), on = :ID, method=:sort) == semi == semijoin(name, sort(job, :ID), on = :ID, threads = false, method=:sort) == semijoin(name, sort(job, :ID, rev=true), on = :ID, method=:sort) @test semijoin(name, job, on = :ID) == semi == semijoin(name, job, on = :ID, method = :hash, threads = false) - @test antijoin(name, job, on = :ID) == anti == antijoin(name, job, on = :ID, threads = false) + @test antijoin(name, job, on = :ID, method=:sort) == anti == antijoin(name, job, on = :ID, threads = false, method=:sort) + @test antijoin(name, sort(job, :ID), on = :ID, method=:sort) == anti == antijoin(name, sort(job, :ID), on = :ID, threads = false, method=:sort) == antijoin(name, sort(job, :ID, rev=true), on = :ID, method=:sort) @test antijoin(name, job, on = :ID) == anti == antijoin(name, job, on = :ID, method = :hash, threads = false) @test closejoin(classA, grades, on = :mark) == closeone == closejoin(classA, grades, on = :mark, threads = false) + @test closejoin(classA, sort(grades, :mark), on = :mark) == closeone == closejoin(classA, sort(grades, :mark), on = :mark, threads = false) == closejoin(classA, sort(grades, :mark, rev=true), on = :mark) @test closejoin(classA, grades, on = :mark) == closeone == closejoin(classA, grades, on = :mark, method = :hash, threads = false) @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, threads = false) + @test closejoin(trades, sort(quotes, :time), on = :time, makeunique = true) == closefinance1 == closejoin(trades, sort(quotes, :time), on = :time, makeunique = true, threads = false) == closejoin(trades, sort(quotes, :time, rev=true), on = :time, makeunique = true) @test closejoin(trades, quotes, on = :time, makeunique = true) == closefinance1 == closejoin(trades, quotes, on = :time, makeunique = true, method = :hash, threads = false) @test innerjoin(name, view(job, :, :), on = :ID) == inner @@ -139,13 +146,20 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Millisecond(2)) @test closejoin(trades, quotes, on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Day(2)) @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms + @test closejoin(trades, sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on =[:ticker, :time], tol = Millisecond(0)) @test closejoin(trades, quotes, on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact + @test closejoin(trades, sort(quotes, [:ticker, :time]), on = [:ticker, 
:time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact == closejoin(trades, sort(quotes, [:ticker, :time], rev=[true, false]), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms + @test closejoin!(copy(trades), sort(quotes, [:ticker, :time]), on =[:ticker, :time], tol = Millisecond(2)) == closfinance_tol2ms == closejoin!(copy(trades), sort(quotes, [:ticker, :time], rev=true), on =[:ticker, :time], tol = Millisecond(2)) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Day(2)) == closejoin(trades, quotes, on =[:ticker, :time]) @test closejoin!(copy(trades), quotes, on =[:ticker, :time], tol = Millisecond(0)) == closfinance_tol0ms @test closejoin!(copy(trades), quotes, on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact + @test closejoin!(copy(trades), sort(quotes, [:ticker, :time]), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) == closefinance_tol10ms_noexact == closejoin!(copy(trades), sort(quotes, [:ticker, :time], rev=true), on = [:ticker, :time], tol = Millisecond(10), allow_exact_match = false) + @test closejoin(trades, quotes, on =[:ticker, :time], tol = Millisecond(2), method = :hash) == closfinance_tol2ms @test closejoin(trades, quotes, on =[:ticker, :time], tol = Day(2), method = :hash) == closejoin(trades, quotes, on =[:ticker, :time]) @@ -261,6 +275,13 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test leftjoin(nameid, jobid, on = :ID) == left[:, on] @test semijoin(nameid, jobid, on = :ID) == semi[:, on] @test antijoin(nameid, jobid, on = :ID) == anti[:, on] + + @test innerjoin(nameid, sort(jobid, :ID), on = :ID) == inner[:, on] + @test outerjoin(nameid, sort(jobid, :ID), on = :ID) == outer[:, on] + @test leftjoin(nameid, sort(jobid, :ID), on = :ID) == left[:, on] + @test semijoin(nameid, sort(jobid, :ID), on = :ID) == semi[:, on] + @test antijoin(nameid, sort(jobid, :ID), on = :ID) == anti[:, on] + @test innerjoin(nameid, view(jobid, :, :), on = :ID) == inner[:, on] @test outerjoin(nameid, view(jobid, :, :), on = :ID) == outer[:, on] @test leftjoin(nameid, view(jobid, :, :), on = :ID) == left[:, on] @@ -821,6 +842,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test left2 == leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, threads = false) @test left2 == leftjoin(dsl, dsr, on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash, threads = false) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, threads = false) + @test left2 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], makeunique = true, accelerate = false, stable = true, check = false, method = :hash, threads = false) + @test left1 == left2 @test unique(select!(left1, [:x1, :x2, :x3]), [:x1, :x2]) == unique(dsl, [:x1, :x2]) @@ -845,7 +870,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- semi1 = semijoin(dsl, dsr, on = [:x1, :x2]) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], method = :hash) @test semi1 
== semijoin(dsl, dsr, on = [:x1, :x2], threads = false) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], threads = false) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], method = :hash, threads = false) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], method = :hash, threads = false) + semi2 = semijoin(dsl, dsr, on = [:x1, :x2], accelerate = true) @test semi1 == dsl @@ -866,12 +894,16 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash) @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :hash) + @test left1 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :sort) + @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, threads = false) @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash, threads = false) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, threads = false) @test out1 == outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true, method = :hash, threads = false) @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) + @test left1 == leftjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) == leftjoin(dsl, sort(dsr, [:x1, :x2], rev=true), on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, threads = false) + @test left1 == leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true, method = :hash, threads = false) @test inn1 == out1 == left1 @@ -915,7 +947,10 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- setformat!(dsl, 1:2=>fmtfun) semi1 = semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false]) @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], method =:hash) - @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], threads = false) + @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2]), on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, sort(dsr, [:x1, :x2, :y2]), on = [:x1, :x2], mapformats = [true, false], threads = false, method=:sort) + @test semi1 == semijoin(dsl, dsr, on = [:x1, :x2], mapformats = [true, false], method = :hash, threads = false) @@ -924,6 +959,8 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test semi2 == dsl inn1 = innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true) out1 = outerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable = true) + out1 = outerjoin(dsl, sort(dsr, [:x1, :x2]), on =[:x1, :x2], mapformats = [true, false], stable = true) + left1 = leftjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], accelerate = true, stable =true) @test inn1 == innerjoin(dsl, dsr, on =[:x1, :x2], mapformats = [true, false], stable 
= true, method = :hash) @@ -995,20 +1032,52 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016- @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing]) + @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300]) + + dsl = Dataset(x = [1,1,1,2,2,2], y = ([6,4,1,2,5,3])) dsr = Dataset(x = [1,1,2], y = PooledArray([0,3,1]), z=[100,200,300]) @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 
300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z], rev=true), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
     dsl = Dataset(x = [1,1,1,2,2,2], y = PooledArray([6,4,1,2,5,3]))
     dsr = Dataset(x = [1,1,2], y = ([0,3,1]), z=[100,200,300])
@@ -1046,10 +1115,31 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
     @test closejoin(dsl, dsr, on = [:x, :y], method = :hash, makeunique = true, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,100, 300,300,300])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, missing, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y, :z]), on = [:x, :y], makeunique = true, method=:hash, direction = :forward, border = :nearest) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[200,200,200, 300,300,300])
+
+    pushfirst!(dsr, (1,4,100))
+    @test closejoin(dsl, dsr, on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y]), on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+    @test closejoin(dsl, sort(dsr, [:x, :y], rev=true), on = [:x, :y], makeunique = true, direction=:forward) == Dataset(x=[1,1,1,2,2,2], y=[6,4,1,2,5,3],z=[missing, 100, 200, missing, missing, missing])
+
     #views
     for i in 1:100
         l_ridx= rand(1:100, 200)
@@ -1062,6 +1152,14 @@ closefinance_tol10ms_noexact = Dataset([Union{Missing, DateTime}[DateTime("2016-
         @test leftjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
         @test innerjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
         @test outerjoin(view(dsl, l_ridx, l_cidx), dsr, on =[:x1], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), dsr, on =[:x1], makeunique=true, check = false)
+
+
+        @test leftjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test innerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test outerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1, :x2], makeunique=true, check = false)
+        @test leftjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == leftjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
+        @test innerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == innerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
+        @test outerjoin(view(dsl, l_ridx, l_cidx), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false) == outerjoin(Dataset(view(dsl, l_ridx, l_cidx)), sort(dsr, [:x1, :x2]), on =[:x1], makeunique=true, check = false)
     end
     for i in 1:100
@@ -1429,53 +1527,51 @@ end
           s([:id, :fid]) == Dataset([[1, 3], [1, 3]], [:id, :fid])
     @test typeof.(eachcol(s(:id))) ==
           typeof.(eachcol(s(:fid))) ==
-          typeof.(eachcol(s([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(s([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test a(:id) == a(:fid) == a([:id, :fid]) == Dataset([[5], [5]], [:id, :fid])
     @test typeof.(eachcol(a(:id))) ==
           typeof.(eachcol(a(:fid))) ==
-          typeof.(eachcol(a([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(a([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :id
     @test i(on) == Dataset([[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], fid_1 = [1, 3, missing])
     @test typeof.(eachcol(l(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o(on) ≅ Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4])
     @test typeof.(eachcol(o(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :fid
     @test i(on) == Dataset([[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Int}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test l(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], id_1 = [1, 3, missing])
-    @test typeof.(eachcol(l(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}},
-                                      Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(l(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test o(on) ≅ Dataset(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, missing, 0, 2, 4])
-    @test typeof.(eachcol(o(on))) == [Vector{Union{Int, Missing}}, Vector{Union{Missing, Float64}},
-                                      Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(o(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     on = [:id, :fid]
     @test i(on) == Dataset([[1, 3], [1, 3]], [:id, :fid])
-    @test typeof.(eachcol(i(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l(on) == Dataset(id = [1, 3, 5], fid = [1, 3, 5])
-    @test typeof.(eachcol(l(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(l(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o(on) == Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-    @test typeof.(eachcol(o(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(o(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     i_hash(on) = innerjoin(ds1, ds2, on = on, makeunique=true, method = :hash)
@@ -1489,53 +1585,52 @@ end
           s_hash([:id, :fid]) == Dataset([[1, 3], [1, 3]], [:id, :fid])
     @test typeof.(eachcol(s_hash(:id))) ==
           typeof.(eachcol(s_hash(:fid))) ==
-          typeof.(eachcol(s_hash([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(s_hash([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test a_hash(:id) == a_hash(:fid) == a_hash([:id, :fid]) == Dataset([[5], [5]], [:id, :fid])
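+    # a reminder for the expectations below: `eachcol` on a `Dataset` yields
+    # `DatasetColumn{Dataset, Vector{...}}` wrappers rather than raw `Vector`s,
+    # which is why the expected column types changed throughout this testset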
     @test typeof.(eachcol(a_hash(:id))) ==
           typeof.(eachcol(a_hash(:fid))) ==
-          typeof.(eachcol(a_hash([:id, :fid]))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+          typeof.(eachcol(a_hash([:id, :fid]))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :id
     @test i_hash(on) == Dataset([[1, 3], [1, 3], [1, 3]], [:id, :fid, :fid_1])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l_hash(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], fid_1 = [1, 3, missing])
     @test typeof.(eachcol(l_hash(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o_hash(on) ≅ Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4])
     @test typeof.(eachcol(o_hash(on))) ==
-          [Vector{Union{Missing, Int}}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
+          [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     on = :fid
     @test i_hash(on) == Dataset([[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}, Vector{Union{Missing, Int}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     @test l_hash(on) ≅ Dataset(id = [1, 3, 5], fid = [1, 3, 5], id_1 = [1, 3, missing])
-    @test typeof.(eachcol(l_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}},
-                                           Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(l_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}},
+                                           DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}]
     @test o_hash(on) ≅ Dataset(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, missing, 0, 2, 4])
-    @test typeof.(eachcol(o_hash(on))) == [Vector{Union{Int, Missing}}, Vector{Union{Missing, Float64}},
-                                           Vector{Union{Int, Missing}}]
+    @test typeof.(eachcol(o_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Int}}}]
     on = [:id, :fid]
     @test i_hash(on) == Dataset([[1, 3], [1, 3]], [:id, :fid])
-    @test typeof.(eachcol(i_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(i_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test l_hash(on) == Dataset(id = [1, 3, 5], fid = [1, 3, 5])
-    @test typeof.(eachcol(l_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(l_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
     @test o_hash(on) == Dataset(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-    @test typeof.(eachcol(o_hash(on))) == [Vector{Union{Missing, Int}}, Vector{Union{Missing, Float64}}]
+    @test typeof.(eachcol(o_hash(on))) == [DatasetColumn{Dataset, Vector{Union{Missing, Int64}}}, DatasetColumn{Dataset, Vector{Union{Missing, Float64}}}]
 #####
 dsl = Dataset(x=[1,2], y=[3,4])
diff --git a/test/stats.jl b/test/stats.jl
index 3427e670..60a8e115 100644
--- a/test/stats.jl
+++ b/test/stats.jl
@@ -163,4 +163,57 @@ end
     @test isequal(IMD.cumprod(x4, missings = :skip), [missing,missing,missing,2])
     @test isequal(IMD.cumprod(x5, missings = :skip), [missing,missing,-9.0,-18.0])
     @test isequal(IMD.cumprod(x6, missings = :skip), [missing,missing, missing, missing])
+end
+@testset "IMD.sum & IMD.mean & IMD.var" begin
+    x = Union{Missing, Int32}[missing, missing, missing, missing]
+    @test isequal(IMD.sum(x), missing)
+    @test IMD.sum(y->ismissing(y) ? 1 : y, x) == 4
+    push!(x, 1)
+    @test IMD.sum(x) == 1
+    @test IMD.sum(y->ismissing(y) ? 1 : y, x) == 5
+
+    @test IMD.mean(x) == 1
+    @test ismissing(IMD.mean(y->isequal(y, 1) ? missing : y, x))
+    @test IMD.mean(y->ismissing(y) ? 1 : y, x) == 1
+
+    @test isequal(IMD.var(x), missing)
+    @test isequal(IMD.var(x, false), 0.0)
+
+    @test isequal(IMD.var(y->ismissing(y) ? 1 : y, x), 0.0)
+    @test isequal(IMD.var(y->ismissing(y) ? 1 : y, x, false), 0.0)
+
+    x = [true, false, true, missing]
+    @test IMD.sum(x) == 2
+    @test IMD.sum(y->isequal(y, true) ? 100 : y, x) == 200
+
+    for i in 1:10
+        x = rand(1:10000, 100)
+        @test IMD.sum(x) == sum(x)
+        x = allowmissing(x)
+        x[50] = missing
+        @test IMD.sum(y->ismissing(y) ? 0 : y, x) == sum(y->ismissing(y) ? 0 : y, x)
+    end
+    if VERSION > v"1.8" # this causes problems in v"1.6"; however, we can ignore it for those versions
+        x = rand(10)
+        n_a = [@allocated IMD.sum(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = Union{Int32, Missing}[1, 2, missing, 4]
+        n_a = [@allocated IMD.sum(x) for _ in 1:10]
+        @test n_a[end] == 0
+
+        n_a = [@allocated IMD.sum(y->ismissing(y) ? 0 : y, x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = rand(10)
+        n_a = [@allocated IMD.mean(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        x = Union{Int32, Missing}[1, 2, missing, 4]
+        n_a = [@allocated IMD.mean(x) for _ in 1:10]
+        @test n_a[end] <= 16
+
+        n_a = [@allocated IMD.mean(y->ismissing(y) ? 0 : y, x) for _ in 1:10]
+        @test n_a[end] <= 16
+    end
 end
\ No newline at end of file
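
A minimal sketch of the steady-state allocation check used in the new
`test/stats.jl` testset above (the `IMD` alias and the `<= 16` budget are taken
from those tests; the snippet itself is illustrative and not part of the suite).
Running the call inside a comprehension lets the early iterations absorb any
compilation cost, so only the final iteration reflects steady-state allocations:

using Test, InMemoryDatasets  # `IMD` is the package alias used by the test suite

x = rand(10)
n_a = [@allocated IMD.sum(x) for _ in 1:10]  # warm-up runs come first
@test n_a[end] <= 16                         # assert only on the final, warmed-up run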