From 058ee8bd73e32f947363978ca061f69f4a6328af Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 19 Sep 2021 17:07:43 +0200 Subject: [PATCH 1/4] Support any `AbstractVector`s in `levels!` and `CategoricalPool` (#365) --- src/array.jl | 2 +- src/pool.jl | 4 ++-- test/04_constructors.jl | 7 +++++++ test/13_arraycommon.jl | 9 +++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/array.jl b/src/array.jl index ddb6af6c..6c3cd8b2 100644 --- a/src/array.jl +++ b/src/array.jl @@ -759,7 +759,7 @@ If `A` accepts missing values (i.e. `eltype(A) >: Missing`) and `allowmissing=tr entries corresponding to omitted levels will be set to `missing`. Else, `newlevels` must include all levels which appear in the data. """ -function levels!(A::CategoricalArray{T, N, R}, newlevels::Vector; +function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; allowmissing::Bool=false, allow_missing::Union{Bool, Nothing}=nothing) where {T, N, R} if allow_missing !== nothing diff --git a/src/pool.jl b/src/pool.jl index 5af7a88d..1018cfee 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -9,9 +9,9 @@ CategoricalPool{T, R}(ordered::Bool=false) where {T, R} = CategoricalPool{T}(ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(T[], ordered) -CategoricalPool{T, R}(levels::Vector, ordered::Bool=false) where {T, R} = +CategoricalPool{T, R}(levels::AbstractVector, ordered::Bool=false) where {T, R} = CategoricalPool{T, R, CategoricalValue{T, R}}(convert(Vector{T}, levels), ordered) -CategoricalPool(levels::Vector{T}, ordered::Bool=false) where {T} = +CategoricalPool(levels::AbstractVector{T}, ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(convert(Vector{T}, levels), ordered) CategoricalPool(invindex::Dict{T, R}, ordered::Bool=false) where {T, R <: Integer} = diff --git a/test/04_constructors.jl b/test/04_constructors.jl index e7990a07..5b39f95e 100644 --- a/test/04_constructors.jl +++ b/test/04_constructors.jl @@ -165,4 
+165,11 @@ end @test_throws ArgumentError CategoricalPool(["a", "a"]) end +@testset "Constructor with various vector types" begin + @test CategoricalPool(2:4) == CategoricalPool(2.0:4.0) == + CategoricalPool([2, 3, 4]) + @test CategoricalPool(2:4, true) == CategoricalPool(2.0:4.0, true) == + CategoricalPool([2, 3, 4], true) +end + end \ No newline at end of file diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 77cbe593..2ac04c51 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -1924,6 +1924,15 @@ end @test !isassigned(x, 3) end +@testset "levels! with various vector types" begin + for levs in (3:-1:1, categorical(3:-1:1), + 3.0:-1.0:1.0, categorical(3.0:-1.0:1.0)) + x = CategoricalVector([1, 2, 3]) + @test levels!(x, levs) === x + @test levels(x) == 3:-1:1 + end +end + # TODO: move struct definition inside @testset block once we require Julia 1.6 struct UnorderedBar <: Number a::String From f6bd5e7ecbe8a020be9cc0dd469deea16ea8ebf0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 25 Oct 2021 14:03:29 +0200 Subject: [PATCH 2/4] Do not print type information when `:compact=>true` (#371) `:compact=>true` used to be recommended only to print fewer digits for numbers, but now the convention appears to be to use it more generally. This is not needed for `CategoricalArray` printing as they already set `:typeinfo`, but it can help in other contexts such as when `NamedArray` dimension names are `CategoricalValue`s. 
--- src/value.jl | 3 ++- test/06_show.jl | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/value.jl b/src/value.jl index e2fe2eb5..7206698e 100644 --- a/src/value.jl +++ b/src/value.jl @@ -101,7 +101,8 @@ Base.convert(::Type{Union{S, Nothing}}, x::CategoricalValue) where {S <: Support Base.Broadcast.broadcastable(x::CategoricalValue) = Ref(x) function Base.show(io::IO, x::CategoricalValue) - if nonmissingtype(get(io, :typeinfo, Any)) === nonmissingtype(typeof(x)) + if get(io, :compact, false) || + nonmissingtype(get(io, :typeinfo, Any)) === nonmissingtype(typeof(x)) show(io, unwrap(x)) else print(io, typeof(x)) diff --git a/test/06_show.jl b/test/06_show.jl index 7ca4cf01..54a98f2e 100644 --- a/test/06_show.jl +++ b/test/06_show.jl @@ -47,6 +47,14 @@ using CategoricalArrays @test sprint(show, ov2, context=:typeinfo=>typeof(ov2)) == "\"b\"" @test sprint(show, ov3, context=:typeinfo=>typeof(ov3)) == "\"a\"" + @test sprint(show, nv1, context=:compact=>true) == "\"c\"" + @test sprint(show, nv2, context=:compact=>true) == "\"b\"" + @test sprint(show, nv3, context=:compact=>true) == "\"a\"" + + @test sprint(show, ov1, context=:compact=>true) == "\"c\"" + @test sprint(show, ov2, context=:compact=>true) == "\"b\"" + @test sprint(show, ov3, context=:compact=>true) == "\"a\"" + @test sprint(print, nv1) == sprint(print, ov1) == "c" @test sprint(print, nv2) == sprint(print, ov2) == "b" @test sprint(print, nv3) == sprint(print, ov3) == "a" From ee3ccfe6ea868ca24ddf0c358af421a6df1fc377 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 25 Oct 2021 14:05:11 +0200 Subject: [PATCH 3/4] Improve `cut` docstring (#372) The last interval is only closed on the right when `extend=true` is passed. 
--- src/extras.jl | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index bf2aa0c0..fb9c4c9b 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -40,8 +40,9 @@ default_formatter(from, to, i; leftclosed, rightclosed) = Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating the interval into which each entry falls. Intervals are of the form `[lower, upper)`, -i.e. the lower bound is included and the upper bound is excluded, except for the last -interval which is closed on both ends, i.e. `[lower, upper]`. +i.e. the lower bound is included and the upper bound is excluded, except +if `extend=true` the last interval, which is then closed on both ends, +i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will also accept them. @@ -101,11 +102,11 @@ julia> cut(-1:0.5:1, 3, labels=fmt) ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; - extend::Union{Bool, Missing}=false, - labels::Union{AbstractVector{<:AbstractString},Function}=default_formatter, - allowmissing::Union{Bool, Nothing}=nothing, - allow_missing::Union{Bool, Nothing}=nothing, - allowempty::Bool=false) + extend::Union{Bool, Missing}=false, + labels::Union{AbstractVector{<:AbstractString},Function}=default_formatter, + allowmissing::Union{Bool, Nothing}=nothing, + allow_missing::Union{Bool, Nothing}=nothing, + allowempty::Bool=false) if allow_missing !== nothing Base.depwarn("allow_missing argument is deprecated, use extend=missing instead", :cut) @@ -188,7 +189,8 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, leftclosed=breaks[end-1] != breaks[end], rightclosed=coalesce(extend, false)) else - length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) + length(labels) == n-1 || + throw(ArgumentError("labels must be of length $(n-1), but got length 
$(length(labels))")) # Levels must have element type String for type stability of the result levs::Vector{String} = copy(labels) end @@ -228,7 +230,7 @@ quantiles. * `labels::Union{AbstractVector,Function}`: a vector of strings giving the names to use for the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval if `extend == true`). + `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval). * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints are equal, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as unused levels From 1ed6f1aa87dd275ea36ce6ca1d65b5732a1f69bb Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 7 Nov 2021 12:00:25 +0100 Subject: [PATCH 4/4] Bump version to 0.10.2 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 2569cef4..2d5b4f75 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "CategoricalArrays" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.1" +version = "0.10.2" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"