diff --git a/Project.toml b/Project.toml index df726605..2569cef4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,16 +1,14 @@ name = "CategoricalArrays" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.0" +version = "0.10.1" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" -JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] @@ -19,15 +17,22 @@ JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" Missings = "0.4.3, 1" RecipesBase = "1.1" +Requires = "1" +SentinelArrays = "1" StructTypes = "1" julia = "1" [extras] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "JSON3", "Plots", "PooledArrays", "Test"] +test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", + "RecipesBase", "SentinelArrays", "StructTypes", "Test"] diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f320a3a6..5c2ae42b 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -86,4 +86,12 @@ SUITE["repeated assignment"]["empty dest"] = SUITE["repeated assignment"]["same levels dest"] = @benchmarkable mycopy!(c2, a) setup = c2=copy(c) SUITE["repeated assignment"]["many levels dest"] = - @benchmarkable 
mycopy!(d2, a) setup = d2=copy(d) \ No newline at end of file + @benchmarkable mycopy!(d2, a) setup = d2=copy(d) + +orig_vec = (x -> repeat(x, 32)).(string.([x % 1000 for x in 1:1000000])) +cat2merge_vec = (x -> repeat(x, 32)).(string.([x % 1000 for x in 1:100000])) +SUITE["recode"] = BenchmarkGroup() +SUITE["recode"]["vectors"] = @benchmarkable recode(orig_vec, cat2merge_vec => "None") +SUITE["recode"]["categorical_vectors"] = @benchmarkable recode(categorical(orig_vec), cat2merge_vec => "None") +SUITE["recode"]["matrices"] = @benchmarkable recode(reshape(orig_vec, :, 1), reshape(cat2merge_vec, :, 1) => "None") +SUITE["recode"]["categorical_matrices"] = @benchmarkable recode(categorical(reshape(orig_vec, :, 1)), reshape(cat2merge_vec, :, 1) => "None") diff --git a/docs/Project.toml b/docs/Project.toml index 1b9ab1f8..1a6d3094 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,4 +2,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" [compat] -Documenter = "0.24" +Documenter = "~0.27" diff --git a/docs/make.jl b/docs/make.jl index d8a08040..6a5e3be8 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,8 +15,8 @@ makedocs( "Implementation details" => "implementation.md", "API index" => "apiindex.md" ], - doctest = true, - checkdocs = :exports + checkdocs = :exports, + strict=true ) deploydocs( diff --git a/docs/src/using.md b/docs/src/using.md index d3c06a2d..8ba29f06 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Array{String,1}: +3-element Vector{String}: "Middle" "Old" "Young" @@ -38,10 +38,10 @@ Thanks to this order, we can not only test for equality between two values, but ```jldoctest using julia> x[1] -CategoricalValue{String,UInt32} "Old" (3/3) +CategoricalValue{String, UInt32} "Old" (3/3) julia> x[2] -CategoricalValue{String,UInt32} "Young" (1/3) +CategoricalValue{String, UInt32} "Young" (1/3) 
julia> x[2] == x[4] true @@ -58,7 +58,7 @@ julia> x[1] = "Young" "Young" julia> x[1] -CategoricalValue{String,UInt32} "Young" (1/3) +CategoricalValue{String, UInt32} "Young" (1/3) ``` @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Array{String,1}: +3-element Vector{String}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Array{String,1}: +2-element Vector{String}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Array{String,1}: +3-element Vector{String}: "Middle" "Old" "Young" @@ -157,7 +157,7 @@ At this point, indexing into the array gives exactly the same result ```jldoctest using julia> y[1] -CategoricalValue{String,UInt32} "Old" (3/3) +CategoricalValue{String, UInt32} "Old" (3/3) ``` Missing values can be introduced either manually, or by restricting the set of possible levels. Let us imagine this time that we actually do not know the age of the first individual. 
We can set it to a missing value this way: @@ -225,7 +225,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Array{String,1}: +3-element Vector{String}: "Young" "Middle" "Old" @@ -237,7 +237,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Array{String,1}: +2-element Vector{String}: "Middle" "Old" @@ -270,7 +270,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Array{String,1}: +3-element Vector{String}: "a" "b" "c" @@ -294,7 +294,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Array{String,1}: +3-element Vector{String}: "a" "b" "c" diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index d1611451..967cc368 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -11,12 +11,10 @@ module CategoricalArrays import DataAPI: unwrap export unwrap - using JSON using DataAPI using Missings using Printf - import RecipesBase - import StructTypes + using Requires: @require # JuliaLang/julia#36810 if VERSION < v"1.5.2" @@ -36,4 +34,67 @@ module CategoricalArrays include("recode.jl") include("deprecated.jl") + + function __init__() + @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" begin + # JSON of CategoricalValue is JSON of the value it refers to + JSON.lower(x::CategoricalValue) = JSON.lower(unwrap(x)) + end + + @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" @eval begin + RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue + level_strings = [map(string, levels(v)); missing] + ticks --> eachindex(level_strings) + v -> ismissing(v) ? 
length(level_strings) : Int(refcode(v)), + i -> level_strings[Int(i)] + end + end + + @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" begin + copyto!(dest::CatArrOrSub{<:Any, 1}, src::SentinelArrays.ChainedVector) = + copyto!(dest, 1, src, 1, length(src)) + copyto!(dest::CatArrOrSub{<:Any, 1}, dstart::Union{Signed, Unsigned}, + src::SentinelArrays.ChainedVector, sstart::Union{Signed, Unsigned}, + n::Union{Signed, Unsigned}) = + invoke(copyto!, Tuple{AbstractArray, Union{Signed, Unsigned}, + SentinelArrays.ChainedVector, + Union{Signed, Unsigned}, Union{Signed, Unsigned}}, + dest, dstart, src, sstart, n) + end + + @require StructTypes="856f2bd8-1eba-4b0a-8007-ebc267875bd4" begin + # define appropriate handlers for JSON3 interface + StructTypes.StructType(x::CategoricalValue) = StructTypes.StructType(unwrap(x)) + StructTypes.StructType(::Type{<:CategoricalValue{T}}) where {T} = StructTypes.StructType(T) + StructTypes.numbertype(::Type{<:CategoricalValue{T}}) where {T <: Number} = T + StructTypes.construct(::Type{T}, x::CategoricalValue{T}) where {T} = T(unwrap(x)) + + # JSON3 writing/reading + StructTypes.StructType(::Type{<:CategoricalVector}) = StructTypes.ArrayType() + + StructTypes.construct(::Type{<:CategoricalArray}, A::AbstractVector) = + constructgeneral(A) + StructTypes.construct(::Type{<:CategoricalArray}, A::Vector) = + constructgeneral(A) + + function constructgeneral(A) + if eltype(A) === Any + # unlike `replace`, broadcast narrows the type, which allows us to return small + # union eltypes (e.g. 
Union{String,Missing}) + categorical(ifelse.(A .=== nothing, missing, A)) + elseif eltype(A) >: Nothing + categorical(replace(A, nothing=>missing)) + else + categorical(A) + end + end + + StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, + A::AbstractVector) where {T} = + CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) + StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, + A::Vector) where {T} = + CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) + end + end end diff --git a/src/array.jl b/src/array.jl index 8597da63..ddb6af6c 100644 --- a/src/array.jl +++ b/src/array.jl @@ -767,41 +767,74 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::Vector; :levels!) allowmissing = allow_missing end - if !allunique(newlevels) - throw(ArgumentError(string("duplicated levels found: ", - join(unique(filter(x->sum(newlevels.==x)>1, newlevels)), ", ")))) + (levels(A) == newlevels) && return A # nothing to do + + # map each new level to its ref code + newlv2ref = Dict{eltype(newlevels), Int}() + dupnewlvs = similar(newlevels, 0) + for (i, lv) in enumerate(newlevels) + if get!(newlv2ref, lv, i) != i + push!(dupnewlvs, lv) + end + end + if !isempty(dupnewlvs) + throw(ArgumentError(string("duplicated levels found: ", join(unique!(dupnewlvs), ", ")))) + end + + # map each old ref code to new ref code (or 0 if no such level) + oldlevels = levels(pool(A)) + oldref2newref = fill(0, length(oldlevels) + 1) + for (i, lv) in enumerate(oldlevels) + oldref2newref[i + 1] = get(newlv2ref, lv, 0) end - oldlevels = levels(A.pool) - - # first pass to check whether, if some levels are removed, changes can be applied without error - # TODO: save original levels and undo changes in case of error to skip this step - # equivalent to issubset but faster due to JuliaLang/julia#24624 - if !isempty(setdiff(oldlevels, newlevels)) - deleted = [!(l in newlevels) for l in oldlevels] - @inbounds for (i, x) in enumerate(A.refs) - if 
T >: Missing - !allowmissing && x > 0 && deleted[x] && - throw(ArgumentError("cannot remove level $(repr(oldlevels[x])) as it " * - "is used at position $i and allowmissing=false.")) - else - x > 0 && deleted[x] && - throw(ArgumentError("cannot remove level $(repr(oldlevels[x])) as it " * - "is used at position $i. Change the array element " * - "type to Union{$T, Missing} using convert if you want " * - "to transform some levels to missing values.")) + # create the new pool early (throws if the new levels could not be encoded with R) + newpool = CategoricalPool{nonmissingtype(T), R}(copy(newlevels), isordered(A)) + + # recode the refs + arefs = A.refs + # check whether potentially an error can occur due to a missing level + if (!(T >: Missing) || !allowmissing) && any(iszero, @view oldref2newref[2:end]) + # slow pass, check for missing levels + failedpos = 0 + @inbounds for (i, oldref) in enumerate(arefs) + newref = oldref2newref[oldref + 1] + if (oldref > 0) && (newref == 0) + failedpos = i + break end + arefs[i] = newref end - end - # replace the pool and recode refs to reflect new pool - if newlevels != oldlevels - newpool = CategoricalPool{nonmissingtype(T), R}(copy(newlevels), isordered(A.pool)) - update_refs!(A, newlevels) - A.pool = newpool + if failedpos > 0 # a missing at failedpos, revert the changes to A.refs + # build the inverse ref map + newref2oldref = fill(0, length(newlevels) + 1) + @inbounds for (oldref, newref) in enumerate(oldref2newref) + newref2oldref[newref + 1] = oldref - 1 + end + newref2oldref[1] = 0 # missing stays missing + # revert the refs + @inbounds for i in 1:(failedpos - 1) + arefs[i] = newref2oldref[arefs[i] + 1] + end + # throw an error + msg = "cannot remove level $(repr(oldlevels[arefs[failedpos]])) as it is used at position $failedpos" + if !(T >: Missing) + msg *= ". Change the array element type to Union{$T, Missing}" * + " using convert if you want to transform some levels to missing values." 
+ elseif !allowmissing + msg *= " and allowmissing=false." + end + throw(ArgumentError(msg)) + end + else # fast pass, either introducing new missings is allowed or no new missings can occur + @inbounds for i in eachindex(arefs) + arefs[i] = oldref2newref[arefs[i] + 1] + end end + A.pool = newpool # update the pool - A + return A end function _unique(::Type{S}, @@ -836,7 +869,30 @@ unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool) Drop levels which do not appear in categorical array `A` (so that they will no longer be returned by [`levels`](@ref DataAPI.levels)). """ -droplevels!(A::CategoricalArray) = levels!(A, intersect(levels(A), unique(A))) +function droplevels!(A::CategoricalArray) + arefs = refs(A) + nlevels = length(levels(A)) + 1 # +1 for missing + seen = fill(false, nlevels) + seen[1] = true # assume that missing is always observed to simplify checks + nseen = 1 + @inbounds for ref in arefs + if !seen[ref + 1] + seen[ref + 1] = true + nseen += 1 + (nseen == nlevels) && return A # all levels observed, nothing to drop + end + end + + # replace the pool + A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + # recode refs to keep only the seen ones (optimized version of update_refs!()) + seen[1] = false # to start levelsmap from 0 + levelsmap = cumsum(seen) + @inbounds for i in eachindex(arefs) + arefs[i] = levelsmap[Int(arefs[i]) + 1] + end + return A +end """ isordered(A::CategoricalArray) @@ -886,7 +942,8 @@ function Base.append!(A::CategoricalVector, B::CatArrOrSub) return A end -Base.empty!(A::CategoricalArray) = (empty!(A.refs); return A) +Base.empty!(A::CategoricalVector) = (empty!(A.refs); return A) +Base.sizehint!(A::CategoricalVector, sz::Integer) = (sizehint!(A.refs, sz); return A) function Base.reshape(A::CategoricalArray{T, N}, dims::Dims) where {T, N} x = reshape(A.refs, dims) @@ -1042,33 +1099,6 @@ Base.repeat(a::CatArrOrSub{T, N}; inner = nothing, outer = nothing) where {T, N} = 
CategoricalArray{T, N}(repeat(refs(a), inner=inner, outer=outer), copy(pool(a))) -# JSON3 writing/reading -StructTypes.StructType(::Type{<:CategoricalVector}) = StructTypes.ArrayType() - -StructTypes.construct(::Type{<:CategoricalArray}, A::AbstractVector) = - constructgeneral(A) -StructTypes.construct(::Type{<:CategoricalArray}, A::Vector) = - constructgeneral(A) - -function constructgeneral(A) - if eltype(A) === Any - # unlike `replace`, broadcast narrows the type, which allows us to return small - # union eltypes (e.g. Union{String,Missing}) - categorical(ifelse.(A .=== nothing, missing, A)) - elseif eltype(A) >: Nothing - categorical(replace(A, nothing=>missing)) - else - categorical(A) - end -end - -StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, - A::AbstractVector) where {T} = - CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) -StructTypes.construct(::Type{<:CategoricalArray{Union{Missing, T}}}, - A::Vector) where {T} = - CategoricalArray{Union{Missing, T}}(replace(A, nothing=>missing)) - # DataAPI refarray/refvalue/refpool support struct CategoricalRefPool{T, P} <: AbstractVector{T} pool::P diff --git a/src/extras.jl b/src/extras.jl index e6d35dc4..bf2aa0c0 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -93,11 +93,11 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.333333)" - "grp 1 (-1.0//-0.333333)" - "grp 2 (-0.333333//0.333333)" - "grp 3 (0.333333//1.0)" - "grp 3 (0.333333//1.0)" + "grp 1 (-1.0//-0.3333333333333335)" + "grp 1 (-1.0//-0.3333333333333335)" + "grp 2 (-0.3333333333333335//0.33333333333333326)" + "grp 3 (0.33333333333333326//1.0)" + "grp 3 (0.33333333333333326//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; diff --git a/src/recode.jl b/src/recode.jl index 25854670..282d4fb6 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -49,16 +49,21 @@ A user defined type could override this 
method to define an appropriate test fun @inline recode_in(x, collection::Set) = x in collection @inline recode_in(x, collection) = any(x ≅ y for y in collection) +optimize_pair(pair::Pair) = pair +optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second + function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end + opt_pairs = map(optimize_pair, pairs) + @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(pairs) - p = pairs[j] + for j in 1:length(opt_pairs) + p = opt_pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) dest[i] = p.second @@ -96,7 +101,9 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - vals = T[p.second for p in pairs] + opt_pairs = map(optimize_pair, pairs) + + vals = T[p.second for p in opt_pairs] default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) @@ -110,8 +117,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(pairs) - p = pairs[j] + for j in 1:length(opt_pairs) + p = opt_pairs[j] # we use isequal and recode_in because we cannot really distinguish scalars from collections if x ≅ p.first || recode_in(x, p.first) drefs[i] = dupvals ? pairmap[j] : j @@ -268,7 +275,7 @@ julia> x = collect(1:10); julia> recode!(x, 1=>100, 2:4=>0, [5; 9:10]=>-1); julia> x -10-element Array{Int64,1}: +10-element Vector{Int64}: 100 0 0 @@ -316,7 +323,7 @@ by the order of passed `pairs` and `default` will be the last level if provided. 
julia> using CategoricalArrays julia> recode(1:10, 1=>100, 2:4=>0, [5; 9:10]=>-1) -10-element Array{Int64,1}: +10-element Vector{Int64}: 100 0 0 @@ -341,16 +348,16 @@ will accept missing values. julia> using CategoricalArrays julia> recode(1:10, 1=>100, 2:4=>0, [5; 9:10]=>-1, 6=>missing) -10-element Array{Union{Missing, Int64},1}: - 100 - 0 - 0 - 0 - -1 +10-element Vector{Union{Missing, Int64}}: + 100 + 0 + 0 + 0 + -1 missing - 7 - 8 - -1 + 7 + 8 + -1 -1 ``` diff --git a/src/value.jl b/src/value.jl index 532e460e..e2fe2eb5 100644 --- a/src/value.jl +++ b/src/value.jl @@ -5,7 +5,7 @@ CategoricalValue(pool::CategoricalPool{T, R}, level::Integer) where {T, R} = CategoricalValue(value, source::Union{CategoricalValue, CategoricalArray}) Return a `CategoricalValue` object wrapping `value` and attached to -the [`CategoricalPool`](@ref) of `source`. +the `CategoricalPool` of `source`. """ function CategoricalValue(value, source::Union{CategoricalValue, CatArrOrSub}) p = pool(source) @@ -155,7 +155,7 @@ end Base.isless(x::CategoricalValue, y::SupportedTypes) = throw(ArgumentError("cannot compare a `CategoricalValue` to value `v` of type " * - "`$(typeof(x))`: wrap `v` using `CategoricalValue(v, catvalue)` " * + "`$(typeof(y))`: wrap `v` using `CategoricalValue(v, catvalue)` " * "or `CategoricalValue(v, catarray)` first")) Base.isless(y::SupportedTypes, x::CategoricalValue) = isless(x, y) @@ -175,26 +175,11 @@ end Base.:<(x::CategoricalValue, y::SupportedTypes) = throw(ArgumentError("cannot compare a `CategoricalValue` to value `v` of type " * - "`$(typeof(x))`: wrap `v` using `CategoricalValue(v, catvalue)` " * + "`$(typeof(y))`: wrap `v` using `CategoricalValue(v, catvalue)` " * "or `CategoricalValue(v, catarray)` first")) Base.:<(y::SupportedTypes, x::CategoricalValue) = x < y -# JSON of CategoricalValue is JSON of the value it refers to -JSON.lower(x::CategoricalValue) = JSON.lower(unwrap(x)) DataAPI.defaultarray(::Type{CategoricalValue{T, R}}, N) where {T, R} = - 
CategoricalArray{T, N, R} + CategoricalArray{T, N, R} DataAPI.defaultarray(::Type{Union{CategoricalValue{T, R}, Missing}}, N) where {T, R} = - CategoricalArray{Union{T, Missing}, N, R} - -# define appropriate handlers for JSON3 interface -StructTypes.StructType(x::CategoricalValue) = StructTypes.StructType(unwrap(x)) -StructTypes.StructType(::Type{<:CategoricalValue{T}}) where {T} = StructTypes.StructType(T) -StructTypes.numbertype(::Type{<:CategoricalValue{T}}) where {T <: Number} = T -StructTypes.construct(::Type{T}, x::CategoricalValue{T}) where {T} = T(unwrap(x)) - -RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] - ticks --> eachindex(level_strings) - v -> ismissing(v) ? length(level_strings) : Int(refcode(v)), - i -> level_strings[Int(i)] -end + CategoricalArray{Union{T, Missing}, N, R} \ No newline at end of file diff --git a/test/11_array.jl b/test/11_array.jl index 1b8fa582..3056d3ce 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -188,8 +188,16 @@ using CategoricalArrays: DefaultRefType, leveltype @test x[3] === CategoricalValue(x.pool, 1) @test_throws ArgumentError levels!(x, ["a"]) + # check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] + @test_throws ArgumentError levels!(x, ["e", "b"]) + @test_throws ArgumentError levels!(x, ["e", "a", "b", "a"]) + # once again check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] @test levels!(x, ["e", "a", "b"]) === x @test levels(x) == ["e", "a", "b"] diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index 0768b45a..d4221921 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -200,8 +200,16 @@ const ≅ = isequal @test levels(x) == ["b", "a"] @test_throws ArgumentError levels!(x, ["a"]) + # check that x is restored correctly when dropping levels 
is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] + @test_throws ArgumentError levels!(x, ["e", "b"]) + @test_throws ArgumentError levels!(x, ["e", "a", "b", "a"]) + # once again check that x is restored correctly when dropping levels is not allowed + @test x == ["a", "b", "b"] + @test levels(x) == ["b", "a"] @test levels!(x, ["e", "a", "b"]) === x @test levels(x) == ["e", "a", "b"] @@ -216,13 +224,16 @@ const ≅ = isequal @test x[3] === CategoricalValue(x.pool, 3) @test levels(x) == ["e", "a", "b", "c"] + # check once more that x is restored correctly when dropping levels is not allowed @test_throws ArgumentError levels!(x, ["e", "c"]) + @test x == ["c", "b", "b"] + @test levels(x) == ["e", "a", "b", "c"] + # check that with allowed missings the absent levels are converted to missing @test levels!(x, ["e", "c"], allowmissing=true) === x @test levels(x) == ["e", "c"] @test x[1] === CategoricalValue(x.pool, 2) @test x[2] === missing @test x[3] === missing - @test levels(x) == ["e", "c"] push!(x, "e") @test length(x) == 4 diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index ed456832..77cbe593 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -3,12 +3,13 @@ using Test using Missings using Future: copy! using CategoricalArrays, DataAPI -using CategoricalArrays: DefaultRefType +using CategoricalArrays: DefaultRefType, pool using PooledArrays using JSON3 using StructTypes using RecipesBase using Plots +using SentinelArrays const ≅ = isequal const ≇ = !isequal @@ -2069,4 +2070,89 @@ end @test res.args[2].data == [1, 3, 4, 2] end +@testset "sizehint! tests and additional empty! 
tests" begin + x = categorical([1]) + @test sizehint!(x, 1000) === x + @test x == [1] + @test_throws MethodError empty!(categorical([1 2; 3 4])) + @test_throws MethodError sizehint!(categorical([1 2; 3 4]), 1000) +end + + +@testset "levels!() exceptions handling and rolling back to previous state" begin + orig = ["A", "B", "B", "C", "D", "B", "A"] + origmissing = convert(Vector{Union{String,Missing}}, orig) + origmissing[2] = missing + + @testset "throws if duplicate levels provided" begin + x = CategoricalArray(orig) + oldpool = pool(x) + @test_throws ArgumentError levels!(x, ["B", "A", "C", "D", "A"]) + @test x == orig + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + end + + @testset "can drop unused levels if element type is $(eltype(x0))" for x0 in (orig, origmissing) + x = CategoricalArray(x0) + levels!(x, ["E", "A", "B", "C", "D"]) + @test levels(x) == ["E", "A", "B", "C", "D"] + @test x === levels!(x, ["B", "A", "C", "D"]) + @test x ≅ x0 + @test levels(x) == ["B", "A", "C", "D"] + end + + @testset "CategoricalArray which cannot store missings" begin + x = CategoricalArray(orig) + @test levels(x) == ["A", "B", "C", "D"] + oldpool = pool(x) + @test_throws ArgumentError levels!(x, ["B", "A", "C"]) + # check that the x contents have not changed + @test x == orig + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + + # still throws even if allowmissing=true + @test_throws ArgumentError levels!(x, ["B", "A", "C"], allowmissing=true) + # check that the x contents have not changed + @test x == orig + @test pool(x) === oldpool + @test levels(x) == ["A", "B", "C", "D"] + end + + @testset "CategoricalArray which can store missing" begin + x = CategoricalArray(origmissing) + oldpool = pool(x) + @test levels(x) == ["A", "B", "C", "D"] + # throws if missings are not explicitly allowed + @test_throws ArgumentError levels!(x, ["B", "A", "C"]) + # check that the x contents have not changed + @test x ≅ origmissing + @test pool(x) === oldpool 
+ @test levels(x) == ["A", "B", "C", "D"] + + @test x === levels!(x, ["B", "A", "C", "E"], allowmissing=true) + @test x ≅ ["A", missing, "B", "C", missing, "B", "A"] + @test levels(x) == ["B", "A", "C", "E"] + end + + @testset "interaction with ChainedVector" begin + x = ChainedVector([["a", "b"], ["c", "d", "e"]]) + @test CategoricalArray(x) == CategoricalArray{String}(x) == + CategoricalArray{Union{String, Missing}}(x) == x + @test copy!(CategoricalArray{String}(undef, 5), x) == + copyto!(CategoricalArray{String}(undef, 5), x) == + copyto!(CategoricalArray{String}(undef, 5), 1, x, 1, 5) == + x + + x .= "z" + y = categorical(["a", "b", "c", "d", "e"]) + @test copy!(x, y) == y + x .= "z" + @test copyto!(x, y) == y + x .= "z" + @test copyto!(x, 1, y, 1, 5) == y + end +end + end