From 79ff7a727cb8d31ae45e2fc02f0b4586d997f636 Mon Sep 17 00:00:00 2001 From: Peter Gagarinov Date: Tue, 4 May 2021 13:04:26 +0300 Subject: [PATCH 1/9] Optimize recode for large number of categories (#345) --- benchmark/benchmarks.jl | 10 +++++++++- src/recode.jl | 17 ++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f320a3a6..5c2ae42b 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -86,4 +86,12 @@ SUITE["repeated assignment"]["empty dest"] = SUITE["repeated assignment"]["same levels dest"] = @benchmarkable mycopy!(c2, a) setup = c2=copy(c) SUITE["repeated assignment"]["many levels dest"] = - @benchmarkable mycopy!(d2, a) setup = d2=copy(d) \ No newline at end of file + @benchmarkable mycopy!(d2, a) setup = d2=copy(d) + +orig_vec = (x -> repeat(x, 32)).(string.([x % 1000 for x in 1:1000000])) +cat2merge_vec = (x -> repeat(x, 32)).(string.([x % 1000 for x in 1:100000])) +SUITE["recode"] = BenchmarkGroup() +SUITE["recode"]["vectors"] = @benchmarkable recode(orig_vec, cat2merge_vec => "None") +SUITE["recode"]["categorical_vectors"] = @benchmarkable recode(categorical(orig_vec), cat2merge_vec => "None") +SUITE["recode"]["matrices"] = @benchmarkable recode(reshape(orig_vec, :, 1), reshape(cat2merge_vec, :, 1) => "None") +SUITE["recode"]["categorical_matrices"] = @benchmarkable recode(categorical(reshape(orig_vec, :, 1)), reshape(cat2merge_vec, :, 1) => "None") diff --git a/src/recode.jl b/src/recode.jl index 25854670..0242a4c4 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -49,16 +49,21 @@ A user defined type could override this method to define an appropriate test fun @inline recode_in(x, collection::Set) = x in collection @inline recode_in(x, collection) = any(x ≅ y for y in collection) +optimize_pair(pair::Pair) = pair +optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second + function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end + opt_pairs = map(optimize_pair, pairs) + @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(pairs) - p = pairs[j] + for j in 1:length(opt_pairs) + p = opt_pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) dest[i] = p.second @@ -96,7 +101,9 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - vals = T[p.second for p in pairs] + opt_pairs = map(optimize_pair, pairs) + + vals = T[p.second for p in opt_pairs] default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) @@ -110,8 +117,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(pairs) - p = pairs[j] + for j in 1:length(opt_pairs) + p = opt_pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j From 3b517d318d73963334b324e74130705a1cb01349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 17 May 2021 13:51:44 +0200 Subject: [PATCH 2/9] add sizehint! (#353) --- src/array.jl | 3 ++- test/13_arraycommon.jl | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 8597da63..78d74931 100644 --- a/src/array.jl +++ b/src/array.jl @@ -886,7 +886,8 @@ function Base.append!(A::CategoricalVector, B::CatArrOrSub) return A end -Base.empty!(A::CategoricalArray) = (empty!(A.refs); return A) +Base.empty!(A::CategoricalVector) = (empty!(A.refs); return A) +Base.sizehint!(A::CategoricalVector, sz::Integer) = (sizehint!(A.refs, sz); return A) function Base.reshape(A::CategoricalArray{T, N}, dims::Dims) where {T, N} x = reshape(A.refs, dims) diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index ed456832..0d897ba1 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2069,4 +2069,12 @@ end @test res.args[2].data == [1, 3, 4, 2] end +@testset "sizehint! tests and additional empty! tests" begin + x = categorical([1]) + @test sizehint!(x, 1000) === x + @test x == [1] + @test_throws MethodError empty!(categorical([1 2; 3 4])) + @test_throws MethodError sizehint!(categorical([1 2; 3 4])) +end + end From 6803c3c79aac6196b770f64282baf2335ca4b45f Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Tue, 17 Aug 2021 12:19:12 +0200 Subject: [PATCH 3/9] droplevels!(): more efficient implementation (#359) --- src/array.jl | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 78d74931..46020650 100644 --- a/src/array.jl +++ b/src/array.jl @@ -836,7 +836,30 @@ unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool) Drop levels which do not appear in categorical array `A` (so that they will no longer be returned by [`levels`](@ref DataAPI.levels)). """ -droplevels!(A::CategoricalArray) = levels!(A, intersect(levels(A), unique(A))) +function droplevels!(A::CategoricalArray) + arefs = refs(A) + nlevels = length(levels(A)) + 1 # +1 for missing + seen = fill(false, nlevels) + seen[1] = true # assume that missing is always observed to simplify checks + nseen = 1 + @inbounds for ref in arefs + if !seen[ref + 1] + seen[ref + 1] = true + nseen += 1 + (nseen == nlevels) && return A # all levels observed, nothing to drop + end + end + + # replace the pool + A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + # recode refs to keep only the seen ones (optimized version of update_refs!()) + seen[1] = false # to start levelsmap from 0 + levelsmap = cumsum(seen) + @inbounds for i in eachindex(arefs) + arefs[i] = levelsmap[Int(arefs[i]) + 1] + end + return A +end """ isordered(A::CategoricalArray) From d9402f58120e9572acf9d4db776d7844acdef648 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Thu, 19 Aug 2021 16:37:47 +0200 Subject: [PATCH 4/9] levels!(): enhance performance (#360) --- src/array.jl | 89 ++++++++++++++++++++++++++++------------- test/11_array.jl | 8 ++++ test/12_missingarray.jl | 13 +++++- test/13_arraycommon.jl | 61 +++++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 30 deletions(-) diff --git a/src/array.jl b/src/array.jl index 46020650..431ce886 100644 --- a/src/array.jl +++ b/src/array.jl @@ -767,41 +767,74 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::Vector; :levels!) allowmissing = allow_missing end - if !allunique(newlevels) - throw(ArgumentError(string("duplicated levels found: ", - join(unique(filter(x->sum(newlevels.==x)>1, newlevels)), ", ")))) + (levels(A) == newlevels) && return A # nothing to do + + # map each new level to its ref code + newlv2ref = Dict{eltype(newlevels), Int}() + dupnewlvs = similar(newlevels, 0) + for (i, lv) in enumerate(newlevels) + if get!(newlv2ref, lv, i) != i + push!(dupnewlvs, lv) + end + end + if !isempty(dupnewlvs) + throw(ArgumentError(string("duplicated levels found: ", join(unique!(dupnewlvs), ", ")))) + end + + # map each old ref code to new ref code (or 0 if no such level) + oldlevels = levels(pool(A)) + oldref2newref = fill(0, length(oldlevels) + 1) + for (i, lv) in enumerate(oldlevels) + oldref2newref[i + 1] = get(newlv2ref, lv, 0) end - oldlevels = levels(A.pool) - - # first pass to check whether, if some levels are removed, changes can be applied without error - # TODO: save original levels and undo changes in case of error to skip this step - # equivalent to issubset but faster due to JuliaLang/julia#24624 - if !isempty(setdiff(oldlevels, newlevels)) - deleted = [!(l in newlevels) for l in oldlevels] - @inbounds for (i, x) in enumerate(A.refs) - if T >: Missing - !allowmissing && x > 0 && deleted[x] && - throw(ArgumentError("cannot remove level $(repr(oldlevels[x])) as it " * - "is used at position $i and allowmissing=false.")) - else - x > 0 && deleted[x] && - throw(ArgumentError("cannot remove level $(repr(oldlevels[x])) as it " * - "is used at position $i. Change the array element " * - "type to Union{$T, Missing} using convert if you want " * - "to transform some levels to missing values.")) + # create the new pool early (throws if the new levels could not be encoded with R) + newpool = CategoricalPool{nonmissingtype(T), R}(copy(newlevels), isordered(A)) + + # recode the refs + arefs = A.refs + # check whether potentially an error can occur due to a missing level + if (!(T >: Missing) || !allowmissing) && any(iszero, @view oldref2newref[2:end]) + # slow pass, check for missing levels + failedpos = 0 + @inbounds for (i, oldref) in enumerate(arefs) + newref = oldref2newref[oldref + 1] + if (oldref > 0) && (newref == 0) + failedpos = i + break end + arefs[i] = newref end - end - # replace the pool and recode refs to reflect new pool - if newlevels != oldlevels - newpool = CategoricalPool{nonmissingtype(T), R}(copy(newlevels), isordered(A.pool)) - update_refs!(A, newlevels) - A.pool = newpool + if failedpos > 0 # a missing at failedpos, revert the changes to A.refs + # build the inverse ref map + newref2oldref = fill(0, length(newlevels) + 1) + @inbounds for (oldref, newref) in enumerate(oldref2newref) + newref2oldref[newref + 1] = oldref - 1 + end + newref2oldref[1] = 0 # missing stays missing + # revert the refs + @inbounds for i in 1:(failedpos - 1) + arefs[i] = newref2oldref[arefs[i] + 1] + end + # throw an error + msg = "cannot remove level $(repr(oldlevels[arefs[failedpos]])) as it is used at position $failedpos" + if !(T >: Missing) + msg *= ". Change the array element type to Union{$T, Missing}" * + " using convert if you want to transform some levels to missing values." + elseif !allowmissing + msg *= " and allowmissing=false." + end + throw(ArgumentError(msg)) + end + else # fast pass, either introducing new missings is allowed or no new missings can occur + @inbounds for i in eachindex(arefs) + arefs[i] = oldref2newref[arefs[i] + 1] + end end + A.pool = newpool # update the pool - A + return A end function _unique(::Type{S}, diff --git a/test/11_array.jl b/test/11_array.jl index 1b8fa582..3056d3ce 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -188,8 +188,16 @@ using CategoricalArrays: DefaultRefType, leveltype @test x[3] === CategoricalValue(x.pool, 1) @test_throws ArgumentError levels!(x, ["a"]) + # check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] + @test_throws ArgumentError levels!(x, ["e", "b"]) + @test_throws ArgumentError levels!(x, ["e", "a", "b", "a"]) + # once again check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] @test levels!(x, ["e", "a", "b"]) === x @test levels(x) == ["e", "a", "b"] diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index 0768b45a..d4221921 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -200,8 +200,16 @@ const ≅ = isequal @test levels(x) == ["b", "a"] @test_throws ArgumentError levels!(x, ["a"]) + # check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] + @test_throws ArgumentError levels!(x, ["e", "b"]) + @test_throws ArgumentError levels!(x, ["e", "a", "b", "a"]) + # once again check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] @test levels!(x, ["e", "a", "b"]) === x @test levels(x) == ["e", "a", "b"] @@ -216,13 +224,16 @@ const ≅ = isequal @test x[3] === CategoricalValue(x.pool, 3) @test levels(x) == ["e", "a", "b", "c"] + # check once more that x is restored correctly when dropping levels is not allowed @test_throws ArgumentError levels!(x, ["e", "c"]) + @test x == ["c", "b", "b"] + @test levels(x) == ["e", "a", "b", "c"] + # check that with allowed missings the absent levels are converted to missing @test levels!(x, ["e", "c"], allowmissing=true) === x @test levels(x) == ["e", "c"] @test x[1] === CategoricalValue(x.pool, 2) @test x[2] === missing @test x[3] === missing - @test levels(x) == ["e", "c"] push!(x, "e") @test length(x) == 4 diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 0d897ba1..728dafee 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -3,7 +3,7 @@ using Test using Missings using Future: copy! using CategoricalArrays, DataAPI -using CategoricalArrays: DefaultRefType +using CategoricalArrays: DefaultRefType, pool using PooledArrays using JSON3 using StructTypes @@ -2077,4 +2077,63 @@ end @test_throws MethodError sizehint!(categorical([1 2; 3 4])) end + +@testset "levels!() exceptions handling and rolling back to previous state" begin + orig = ["A", "B", "B", "C", "D", "B", "A"] + origmissing = convert(Vector{Union{String,Missing}}, orig) + origmissing[2] = missing + + @testset "throws if duplicate levels provided" begin + x = CategoricalArray(orig) + oldpool = pool(x) + @test_throws ArgumentError levels!(x, ["B", "A", "C", "D", "A"]) + @test x == orig + @test pool(x) == oldpool + @test levels(x) == ["A", "B", "C", "D"] + end + + @testset "can drop unused levels if element type is $(eltype(x0))" for x0 in (orig, origmissing) + x = CategoricalArray(x0) + levels!(x, ["E", "A", "B", "C", "D"]) + @test levels(x) == ["E", "A", "B", "C", "D"] + @test x === levels!(x, ["B", "A", "C", "D"]) + @test x ≅ x0 + @test levels(x) == ["B", "A", "C", "D"] + end + + @testset "CategoricalArray which cannot store missings" begin + x = CategoricalArray(orig) + @test levels(x) == ["A", "B", "C", "D"] + oldpool = pool(x) + @test_throws ArgumentError levels!(x, ["B", "A", "C"]) + # check that the x contents have not changed + @test x == orig + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + + # still throws even if allowmissing=true + @test_throws ArgumentError levels!(x, ["B", "A", "C"], allowmissing=true) + # check that the x contents have not changed + @test x == orig + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + end + + @testset "CategoricalArray which can store missing" begin + x = CategoricalArray(origmissing) + oldpool = pool(x) + @test levels(x) == ["A", "B", "C", "D"] + # throws if missings are not explicitly allowed + @test_throws ArgumentError levels!(x, ["B", "A", "C"]) + # check that the x contents have not changed + @test x ≅ origmissing + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + + @test x === levels!(x, ["B", "A", "C", "E"], allowmissing=true) + @test x ≅ ["A", missing, "B", "C", missing, "B", "A"] + @test levels(x) == ["B", "A", "C", "E"] + end +end + end From 7514576ca3466bead7c4396cb4b40a0764ffdf0a Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 15 Sep 2021 18:12:11 +0200 Subject: [PATCH 5/9] Fix doctests and make checks stricter (#367) `doctest=true` is now the default. --- docs/Project.toml | 2 +- docs/make.jl | 4 ++-- docs/src/using.md | 24 ++++++++++++------------ src/extras.jl | 10 +++++----- src/recode.jl | 22 +++++++++++----------- src/value.jl | 2 +- 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index 1b9ab1f8..1a6d3094 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,4 +2,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" [compat] -Documenter = "0.24" +Documenter = "~0.27" diff --git a/docs/make.jl b/docs/make.jl index d8a08040..6a5e3be8 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,8 +15,8 @@ makedocs( "Implementation details" => "implementation.md", "API index" => "apiindex.md" ], - doctest = true, - checkdocs = :exports + checkdocs = :exports, + strict=true ) deploydocs( diff --git a/docs/src/using.md b/docs/src/using.md index d3c06a2d..8ba29f06 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Array{String,1}: +3-element Vector{String}: "Middle" "Old" "Young" @@ -38,10 +38,10 @@ Thanks to this order, we can not only test for equality between two values, but ```jldoctest using julia> x[1] -CategoricalValue{String,UInt32} "Old" (3/3) +CategoricalValue{String, UInt32} "Old" (3/3) julia> x[2] -CategoricalValue{String,UInt32} "Young" (1/3) +CategoricalValue{String, UInt32} "Young" (1/3) julia> x[2] == x[4] true @@ -58,7 +58,7 @@ julia> x[1] = "Young" "Young" julia> x[1] -CategoricalValue{String,UInt32} "Young" (1/3) +CategoricalValue{String, UInt32} "Young" (1/3) ``` @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Array{String,1}: +3-element Vector{String}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Array{String,1}: +2-element Vector{String}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Array{String,1}: +3-element Vector{String}: "Middle" "Old" "Young" @@ -157,7 +157,7 @@ At this point, indexing into the array gives exactly the same result ```jldoctest using julia> y[1] -CategoricalValue{String,UInt32} "Old" (3/3) +CategoricalValue{String, UInt32} "Old" (3/3) ``` Missing values can be introduced either manually, or by restricting the set of possible levels. Let us imagine this time that we actually do not know the age of the first individual. We can set it to a missing value this way: @@ -225,7 +225,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Array{String,1}: +3-element Vector{String}: "Young" "Middle" "Old" @@ -237,7 +237,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Array{String,1}: +2-element Vector{String}: "Middle" "Old" @@ -270,7 +270,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Array{String,1}: +3-element Vector{String}: "a" "b" "c" @@ -294,7 +294,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Array{String,1}: +3-element Vector{String}: "a" "b" "c" diff --git a/src/extras.jl b/src/extras.jl index e6d35dc4..bf2aa0c0 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -93,11 +93,11 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.333333)" - "grp 1 (-1.0//-0.333333)" - "grp 2 (-0.333333//0.333333)" - "grp 3 (0.333333//1.0)" - "grp 3 (0.333333//1.0)" + "grp 1 (-1.0//-0.3333333333333335)" + "grp 1 (-1.0//-0.3333333333333335)" + "grp 2 (-0.3333333333333335//0.33333333333333326)" + "grp 3 (0.33333333333333326//1.0)" + "grp 3 (0.33333333333333326//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; diff --git a/src/recode.jl b/src/recode.jl index 0242a4c4..282d4fb6 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -275,7 +275,7 @@ julia> x = collect(1:10); julia> recode!(x, 1=>100, 2:4=>0, [5; 9:10]=>-1); julia> x -10-element Array{Int64,1}: +10-element Vector{Int64}: 100 0 0 @@ -323,7 +323,7 @@ by the order of passed `pairs` and `default` will be the last level if provided. julia> using CategoricalArrays julia> recode(1:10, 1=>100, 2:4=>0, [5; 9:10]=>-1) -10-element Array{Int64,1}: +10-element Vector{Int64}: 100 0 0 @@ -348,16 +348,16 @@ will accept missing values. julia> using CategoricalArrays julia> recode(1:10, 1=>100, 2:4=>0, [5; 9:10]=>-1, 6=>missing) -10-element Array{Union{Missing, Int64},1}: - 100 - 0 - 0 - 0 - -1 +10-element Vector{Union{Missing, Int64}}: + 100 + 0 + 0 + 0 + -1 missing - 7 - 8 - -1 + 7 + 8 + -1 -1 ``` diff --git a/src/value.jl b/src/value.jl index 532e460e..94dee9f6 100644 --- a/src/value.jl +++ b/src/value.jl @@ -5,7 +5,7 @@ CategoricalValue(pool::CategoricalPool{T, R}, level::Integer) where {T, R} = CategoricalValue(value, source::Union{CategoricalValue, CategoricalArray}) Return a `CategoricalValue` object wrapping `value` and attached to -the [`CategoricalPool`](@ref) of `source`. +the `CategoricalPool` of `source`. """ function CategoricalValue(value, source::Union{CategoricalValue, CatArrOrSub}) p = pool(source) From 52f534ca9741530551e08ef5f3c21203d6904f46 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 15 Sep 2021 18:12:49 +0200 Subject: [PATCH 6/9] Fix type in error message (#366) --- src/value.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/value.jl b/src/value.jl index 94dee9f6..4ec789ee 100644 --- a/src/value.jl +++ b/src/value.jl @@ -155,7 +155,7 @@ end Base.isless(x::CategoricalValue, y::SupportedTypes) = throw(ArgumentError("cannot compare a `CategoricalValue` to value `v` of type " * - "`$(typeof(x))`: wrap `v` using `CategoricalValue(v, catvalue)` " * + "`$(typeof(y))`: wrap `v` using `CategoricalValue(v, catvalue)` " * "or `CategoricalValue(v, catarray)` first")) Base.isless(y::SupportedTypes, x::CategoricalValue) = isless(x, y) @@ -175,7 +175,7 @@ end Base.:<(x::CategoricalValue, y::SupportedTypes) = throw(ArgumentError("cannot compare a `CategoricalValue` to value `v` of type " * - "`$(typeof(x))`: wrap `v` using `CategoricalValue(v, catvalue)` " * + "`$(typeof(y))`: wrap `v` using `CategoricalValue(v, catvalue)` " * "or `CategoricalValue(v, catarray)` first")) Base.:<(y::SupportedTypes, x::CategoricalValue) = x < y From 1fe8ba96aa0003325636cdbc79a47628d273ea48 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 17 Sep 2021 09:17:17 +0200 Subject: [PATCH 7/9] Fix ambiguity error on construction from `ChainedVector` --- Project.toml | 2 ++ src/CategoricalArrays.jl | 1 + src/array.jl | 10 ++++++++++ test/13_arraycommon.jl | 19 +++++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/Project.toml b/Project.toml index df726605..1b02788b 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" @@ -19,6 +20,7 @@ JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" Missings = "0.4.3, 1" RecipesBase = "1.1" +SentinelArrays = "1" StructTypes = "1" julia = "1" diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index d1611451..a6c5b0ea 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -16,6 +16,7 @@ module CategoricalArrays using Missings using Printf import RecipesBase + import SentinelArrays import StructTypes # JuliaLang/julia#36810 diff --git a/src/array.jl b/src/array.jl index 431ce886..8ffc3428 100644 --- a/src/array.jl +++ b/src/array.jl @@ -630,6 +630,16 @@ copy!(dest::CatArrOrSub{<:Any, 1}, src::AbstractArray{<:Any, 1}) = copy!(dest::CatArrOrSub{T, 1}, src::AbstractArray{T, 1}) where {T} = copyto!(dest, 1, src, 1, length(src)) +copyto!(dest::CatArrOrSub{<:Any, 1}, src::SentinelArrays.ChainedVector) = + copyto!(dest, 1, src, 1, length(src)) +copyto!(dest::CatArrOrSub{<:Any, 1}, dstart::Union{Signed, Unsigned}, + src::SentinelArrays.ChainedVector, sstart::Union{Signed, Unsigned}, + n::Union{Signed, Unsigned}) = + invoke(copyto!, Tuple{AbstractArray, Union{Signed, Unsigned}, + SentinelArrays.ChainedVector, + Union{Signed, Unsigned}, Union{Signed, Unsigned}}, + dest, dstart, src, sstart, n) + similar(A::CategoricalArray{S, M, R}, ::Type{T}, dims::NTuple{N, Int}) where {T, N, S, M, R} = Array{T, N}(undef, dims) diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 728dafee..77cbe593 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -9,6 +9,7 @@ using JSON3 using StructTypes using RecipesBase using Plots +using SentinelArrays const ≅ = isequal const ≇ = !isequal @@ -2134,6 +2135,24 @@ end @test x ≅ ["A", missing, "B", "C", missing, "B", "A"] @test levels(x) == ["B", "A", "C", "E"] end + + @testset "interaction with ChainedVector" begin + x = ChainedVector([["a", "b"], ["c", "d", "e"]]) + @test CategoricalArray(x) == CategoricalArray{String}(x) == + CategoricalArray{Union{String, Missing}}(x) == x + @test copy!(CategoricalArray{String}(undef, 5), x) == + copyto!(CategoricalArray{String}(undef, 5), x) == + copyto!(CategoricalArray{String}(undef, 5), 1, x, 1, 5) == + x + + x .= "z" + y = categorical(["a", "b", "c", "d", "e"]) + @test copy!(x, y) == y + x .= "z" + @test copyto!(x, y) == y + x .= "z" + @test copyto!(x, 1, y, 1, 5) == y + end end end From 7904fc94096c0b2a1eaa13f815588cf74f141a9d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 17 Sep 2021 10:00:34 +0200 Subject: [PATCH 8/9] Make many dependencies optional using Requires.jl --- Project.toml | 13 +++++--- src/CategoricalArrays.jl | 68 +++++++++++++++++++++++++++++++++++++--- src/array.jl | 37 ---------------------- src/value.jl | 19 ++--------- 4 files changed, 74 insertions(+), 63 deletions(-) diff --git a/Project.toml b/Project.toml index 1b02788b..a02a994b 100644 --- a/Project.toml +++ b/Project.toml @@ -5,13 +5,10 @@ version = "0.10.0" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" -JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] @@ -20,16 +17,22 @@ JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" Missings = "0.4.3, 1" RecipesBase = "1.1" +Requires = "1" SentinelArrays = "1" StructTypes = "1" julia = "1" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "JSON3", "Plots", "PooledArrays", "Test"] +test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", + "RecipesBase", "SentinelArrays", "StructTypes", "Test"] diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index a6c5b0ea..967cc368 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -11,13 +11,10 @@ module CategoricalArrays import DataAPI: unwrap export unwrap - using JSON using DataAPI using Missings using Printf - import RecipesBase - import SentinelArrays - import StructTypes + using Requires: @require # JuliaLang/julia#36810 if VERSION < v"1.5.2" @@ -37,4 +34,67 @@ module CategoricalArrays include("recode.jl") include("deprecated.jl") + + function __init__() + @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" begin + # JSON of CategoricalValue is JSON of the value it refers to + JSON.lower(x::CategoricalValue) = JSON.lower(unwrap(x)) + end + + @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" @eval begin + RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue + level_strings = [map(string, levels(v)); missing] + ticks --> eachindex(level_strings) + v -> ismissing(v) ? length(level_strings) : Int(refcode(v)), + i -> level_strings[Int(i)] + end + end + + @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" begin + copyto!(dest::CatArrOrSub{<:Any, 1}, src::SentinelArrays.ChainedVector) = + copyto!(dest, 1, src, 1, length(src)) + copyto!(dest::CatArrOrSub{<:Any, 1}, dstart::Union{Signed, Unsigned}, + src::SentinelArrays.ChainedVector, sstart::Union{Signed, Unsigned}, + n::Union{Signed, Unsigned}) = + invoke(copyto!, Tuple{AbstractArray, Union{Signed, Unsigned}, + SentinelArrays.ChainedVector, + Union{Signed, Unsigned}, Union{Signed, Unsigned}}, + dest, dstart, src, sstart, n) + end + + @require StructTypes="856f2bd8-1eba-4b0a-8007-ebc267875bd4" begin + # define appropriate handlers for JSON3 interface + StructTypes.StructType(x::CategoricalValue) = StructTypes.StructType(unwrap(x)) + StructTypes.StructType(::Type{<:CategoricalValue{T}}) where {T} = StructTypes.StructType(T) + StructTypes.numbertype(::Type{<:CategoricalValue{T}}) where {T <: Number} = T + StructTypes.construct(::Type{T}, x::CategoricalValue{T}) where {T} = T(unwrap(x)) + + # JSON3 writing/reading + StructTypes.StructType(::Type{<:CategoricalVector}) = StructTypes.ArrayType() + + StructTypes.construct(::Type{<:CategoricalArray}, A::AbstractVector) = + constructgeneral(A) + StructTypes.construct(::Type{<:CategoricalArray}, A::Vector) = + constructgeneral(A) + + function constructgeneral(A) + if eltype(A) === Any + # unlike `replace`, broadcast narrows the type, which allows us to return small + # union eltypes (e.g. Union{String,Missing}) + categorical(ifelse.(A .=== nothing, missing, A)) + elseif eltype(A) >: Nothing + categorical(replace(A, nothing=>missing)) + else + categorical(A) + end + end + + StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, + A::AbstractVector) where {T} = + CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) + StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, + A::Vector) where {T} = + CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) + end + end end diff --git a/src/array.jl b/src/array.jl index 8ffc3428..ddb6af6c 100644 --- a/src/array.jl +++ b/src/array.jl @@ -630,16 +630,6 @@ copy!(dest::CatArrOrSub{<:Any, 1}, src::AbstractArray{<:Any, 1}) = copy!(dest::CatArrOrSub{T, 1}, src::AbstractArray{T, 1}) where {T} = copyto!(dest, 1, src, 1, length(src)) -copyto!(dest::CatArrOrSub{<:Any, 1}, src::SentinelArrays.ChainedVector) = - copyto!(dest, 1, src, 1, length(src)) -copyto!(dest::CatArrOrSub{<:Any, 1}, dstart::Union{Signed, Unsigned}, - src::SentinelArrays.ChainedVector, sstart::Union{Signed, Unsigned}, - n::Union{Signed, Unsigned}) = - invoke(copyto!, Tuple{AbstractArray, Union{Signed, Unsigned}, - SentinelArrays.ChainedVector, - Union{Signed, Unsigned}, Union{Signed, Unsigned}}, - dest, dstart, src, sstart, n) - similar(A::CategoricalArray{S, M, R}, ::Type{T}, dims::NTuple{N, Int}) where {T, N, S, M, R} = Array{T, N}(undef, dims) @@ -1109,33 +1099,6 @@ Base.repeat(a::CatArrOrSub{T, N}; inner = nothing, outer = nothing) where {T, N} = CategoricalArray{T, N}(repeat(refs(a), inner=inner, outer=outer), copy(pool(a))) -# JSON3 writing/reading -StructTypes.StructType(::Type{<:CategoricalVector}) = StructTypes.ArrayType() - -StructTypes.construct(::Type{<:CategoricalArray}, A::AbstractVector) = - constructgeneral(A) -StructTypes.construct(::Type{<:CategoricalArray}, A::Vector) = - constructgeneral(A) - -function constructgeneral(A) - if eltype(A) === Any - # unlike `replace`, broadcast narrows the type, which allows us to return small - # union eltypes (e.g. Union{String,Missing}) - categorical(ifelse.(A .=== nothing, missing, A)) - elseif eltype(A) >: Nothing - categorical(replace(A, nothing=>missing)) - else - categorical(A) - end -end - -StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, - A::AbstractVector) where {T} = - CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) -StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, - A::Vector) where {T} = - CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) - # DataAPI refarray/refvalue/refpool support struct CategoricalRefPool{T, P} <: AbstractVector{T} pool::P diff --git a/src/value.jl b/src/value.jl index 4ec789ee..e2fe2eb5 100644 --- a/src/value.jl +++ b/src/value.jl @@ -179,22 +179,7 @@ Base.:<(x::CategoricalValue, y::SupportedTypes) = "or `CategoricalValue(v, catarray)` first")) Base.:<(y::SupportedTypes, x::CategoricalValue) = x < y -# JSON of CategoricalValue is JSON of the value it refers to -JSON.lower(x::CategoricalValue) = JSON.lower(unwrap(x)) DataAPI.defaultarray(::Type{CategoricalValue{T, R}}, N) where {T, R} = - CategoricalArray{T, N, R} + CategoricalArray{T, N, R} DataAPI.defaultarray(::Type{Union{CategoricalValue{T, R}, Missing}}, N) where {T, R} = - CategoricalArray{Union{T, Missing}, N, R} - -# define appropriate handlers for JSON3 interface -StructTypes.StructType(x::CategoricalValue) = StructTypes.StructType(unwrap(x)) -StructTypes.StructType(::Type{<:CategoricalValue{T}}) where {T} = StructTypes.StructType(T) -StructTypes.numbertype(::Type{<:CategoricalValue{T}}) where {T <: Number} = T -StructTypes.construct(::Type{T}, x::CategoricalValue{T}) where {T} = T(unwrap(x)) - -RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] - ticks --> eachindex(level_strings) - v -> ismissing(v) ? length(level_strings) : Int(refcode(v)), - i -> level_strings[Int(i)] -end + CategoricalArray{Union{T, Missing}, N, R} \ No newline at end of file From 0bf20678f1933182aeed13ed1ed7ad1bf3dfad7b Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 17 Sep 2021 18:26:47 +0200 Subject: [PATCH 9/9] Bump version to 0.10.1 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index a02a994b..2569cef4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "CategoricalArrays" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.0" +version = "0.10.1" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"