diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..adee0ed1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" \ No newline at end of file diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 7344a549..e628f26d 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,24 +1,39 @@ name: CompatHelper - on: schedule: - - cron: '00 00 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - julia-version: [1.2.0] - julia-arch: [x86] - os: [ubuntu-latest] + CompatHelper: + runs-on: ubuntu-latest steps: - - uses: julia-actions/setup-julia@latest + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v2 with: - version: ${{ matrix.julia-version }} - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
aaeda107..aee70898 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,13 +12,15 @@ jobs: fail-fast: false matrix: version: - - '1.0' + # FIXME! Switch from 1.6 to 'min' once we require a higher minimum + # We can't switch yet as there is a method ambiguity for a dependency + # in version 1.6.0. + - '1.6' + - 'lts' - '1' # automatically expands to the latest stable 1.x release of Julia - - 'nightly' + - 'pre' os: - ubuntu-latest - - macOS-latest - - windows-latest arch: - x64 - x86 @@ -26,12 +28,12 @@ jobs: - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 + - uses: actions/cache@v4 env: cache-name: cache-artifacts with: @@ -44,14 +46,14 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: file: lcov.info docs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-docdeploy@latest env: diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..61842496 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,48 @@ +# CategoricalArrays.jl v1.0.0 Release Notes + +## Breaking changes + +* `unique(::CategoricalArray)` and `levels(::CategoricalArray)` return + a `CategoricalArray` instead of unwrapping values, consistent with + `unique(::AbstractArray)` in Base and `levels(::AbstractArray)` in DataAPI + ([#358](https://github.com/JuliaData/CategoricalArrays.jl/pull/358), + [#425](https://github.com/JuliaData/CategoricalArrays.jl/pull/425)).
+ +* `cut` always closes the last interval on the right + ([#409](https://github.com/JuliaData/CategoricalArrays.jl/pull/409)). + +* `cut(x, breaks)` rounds breaks to generate shorter labels + ([#422](https://github.com/JuliaData/CategoricalArrays.jl/pull/422)). + +* `cut(x, ngroups)` takes breaks from actual values instead of using + quantile estimates which are generally longer + ([#416](https://github.com/JuliaData/CategoricalArrays.jl/pull/416)). + This only changes group labels, not their contents. + +* `T(::CategoricalArray{U})` and `convert(T, ::CategoricalArray{U})` + now consistently return an `Array{U}` for `T` in `Array`, `Vector`, `Matrix`. + This avoids creating `Array{<:CategoricalValue}` objects unless explicitly requested + ([#420](https://github.com/JuliaData/CategoricalArrays.jl/pull/420)). + + +* All deprecations have been removed + ([#419](https://github.com/JuliaData/CategoricalArrays.jl/pull/419)). + +## New features + +* Support reading from and writing to Arrow files + ([#415](https://github.com/JuliaData/CategoricalArrays.jl/pull/415)). + +* Improve performance of `recode` + ([#407](https://github.com/JuliaData/CategoricalArrays.jl/pull/407)). + +* Support weighted quantiles in `cut` + ([#423](https://github.com/JuliaData/CategoricalArrays.jl/pull/423)). + +## Bug fixes + +* Fix performance regression on Julia 1.11 and above + ([#418](https://github.com/JuliaData/CategoricalArrays.jl/pull/418)). + +* Fix `cut` corner cases with duplicated breaks + ([#410](https://github.com/JuliaData/CategoricalArrays.jl/pull/410)).
diff --git a/Project.toml b/Project.toml index 5846e340..83d5ba30 100644 --- a/Project.toml +++ b/Project.toml @@ -1,8 +1,9 @@ name = "CategoricalArrays" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.8" +version = "1.0.0" [deps] +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" @@ -12,18 +13,24 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [weakdeps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" [extensions] +CategoricalArraysArrowExt = "Arrow" CategoricalArraysJSONExt = "JSON" CategoricalArraysRecipesBaseExt = "RecipesBase" +CategoricalArraysStatsBaseExt = "StatsBase" CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" [compat] +Arrow = "2" +Compat = "3.47, 4.10" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" @@ -31,20 +38,23 @@ Missings = "0.4.3, 1" RecipesBase = "1.1" Requires = "1" SentinelArrays = "1" +Statistics = "1" +StatsBase = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34" StructTypes = "1" -julia = "1" +julia = "1.6" [extras] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +RecipesPipeline = "01d81517-befc-4cb6-b9ec-a95719d0359c" 
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", - "RecipesBase", "SentinelArrays", "StructTypes", "Test"] +test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StatsBase", "StructTypes", "Test"] diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5c2ae42b..bf12f7c9 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -55,11 +55,11 @@ SUITE["many levels"]["CategoricalArray(::Vector{String})"] = a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000) ca = CategoricalArray(a) -levs = levels(ca) +levs = unwrap.(levels(ca)) SUITE["many levels"]["levels! with original levels"] = @benchmarkable levels!(ca, levs) -levs = reverse(levels(ca)) +levs = reverse(unwrap.(levels(ca))) SUITE["many levels"]["levels! 
with resorted levels"] = @benchmarkable levels!(ca, levs) diff --git a/docs/Project.toml b/docs/Project.toml index 1a6d3094..1814eb33 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,4 +2,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" [compat] -Documenter = "~0.27" +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index 6a5e3be8..1b260579 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,8 +15,7 @@ makedocs( "Implementation details" => "implementation.md", "API index" => "apiindex.md" ], - checkdocs = :exports, - strict=true + checkdocs = :exports ) deploydocs( diff --git a/docs/src/using.md b/docs/src/using.md index 9790e8cf..24c452b0 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -251,7 +251,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -263,7 +263,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Vector{String}: +2-element 
CategoricalArray{String,1,UInt32}: "Middle" "Old" @@ -271,7 +271,7 @@ julia> x[1] = y[1] CategoricalValue{String, UInt32} "Young" (1/2) julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -296,7 +296,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" @@ -320,7 +320,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl new file mode 100644 index 00000000..811870d2 --- /dev/null +++ b/ext/CategoricalArraysArrowExt.jl @@ -0,0 +1,72 @@ +module CategoricalArraysArrowExt + +using CategoricalArrays +import Arrow +import Arrow: ArrowTypes + +const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray") +ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME +ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R) +ArrowTypes.ArrowType(::Type{<:CategoricalValue{T}}) where {T} = T +ArrowTypes.toarrow(x::CategoricalValue) = unwrap(x) + +ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME +ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = + string(R) + +const REFTYPES = Dict(string(T) => T for T in (Int128, Int16, Int32, Int64, Int8, UInt128, + UInt16, UInt32, UInt64, UInt8)) +function ArrowTypes.JuliaType(::Val{CATARRAY_ARROWNAME}, + ::Type{S}, meta::String) where S + R = REFTYPES[meta] + return CategoricalValue{S, R} +end + +for (MV, MT) in ((:V, :T), (:(Union{V,Missing}), :(Union{T,Missing}))) + @eval begin + function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.List{U, O, B}, + isOrdered, metadata) where + {T, R, V<:CategoricalValue{T,R}, S, O, A, B, U} + newdata = Arrow.List{$MT,O,B}(data.arrow, data.validity, data.offsets, + data.data, 
data.ℓ, data.metadata) + levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata + catdata = CategoricalVector{$MT,R}(newdata, levels=levels) + return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata, + isOrdered, metadata) + end + + function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.Primitive{U, B}, + isOrdered, metadata) where + {T, R, V<:CategoricalValue{T,R}, S, A, B, U} + newdata = Arrow.Primitive{$MT,B}(data.arrow, data.validity, data.data, + data.ℓ, data.metadata) + levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata + catdata = CategoricalVector{$MT,R}(newdata, levels=levels) + return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata, + isOrdered, metadata) + end + end +end + +function Base.copy(x::Arrow.DictEncoded{V}) where {T, R, V<:CategoricalValue{T, R}} + pool = CategoricalPool{T,R}(x.encoding.data) + inds = x.indices + refs = similar(inds, R) + refs .= inds .+ one(R) + return CategoricalVector{T}(refs, pool) +end + +function Base.copy(x::Arrow.DictEncoded{Union{Missing,V}}) where + {T, R, V<:CategoricalValue{T, R}} + ismissing(x.encoding.data[1]) || + throw(ErrorException("`missing` must be the first value in a " * + "`CategoricalArray` pool")) + levels = collect(skipmissing(x.encoding.data)) + pool = CategoricalPool{T,R}(levels) + inds = x.indices + refs = similar(inds, R) + refs .= inds + return CategoricalVector{Union{T,Missing}}(refs, pool) +end + +end diff --git a/ext/CategoricalArraysRecipesBaseExt.jl b/ext/CategoricalArraysRecipesBaseExt.jl index 2642f838..656f3e3d 100644 --- a/ext/CategoricalArraysRecipesBaseExt.jl +++ b/ext/CategoricalArraysRecipesBaseExt.jl @@ -9,7 +9,7 @@ else end RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] + level_strings = [map(string, CategoricalArrays._levels(v)); missing] ticks --> eachindex(level_strings) v -> ismissing(v) ? 
length(level_strings) : Int(CategoricalArrays.refcode(v)), i -> level_strings[Int(i)] diff --git a/ext/CategoricalArraysStatsBaseExt.jl b/ext/CategoricalArraysStatsBaseExt.jl new file mode 100644 index 00000000..8cbd5c61 --- /dev/null +++ b/ext/CategoricalArraysStatsBaseExt.jl @@ -0,0 +1,13 @@ +module CategoricalArraysStatsBaseExt + +if isdefined(Base, :get_extension) + import CategoricalArrays: _wquantile + using StatsBase +else + import ..CategoricalArrays: _wquantile + using ..StatsBase +end + +_wquantile(x::AbstractArray, w::AbstractWeights, p::AbstractVector) = quantile(x, w, p) + +end diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index 214a5d17..f44b3c2f 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -11,6 +11,9 @@ module CategoricalArrays import DataAPI: unwrap export unwrap + using Compat + @compat public default_formatter, numbered_formatter + using DataAPI using Missings using Printf @@ -32,17 +35,17 @@ module CategoricalArrays include("extras.jl") include("recode.jl") - include("deprecated.jl") - if !isdefined(Base, :get_extension) using Requires: @require end @static if !isdefined(Base, :get_extension) function __init__() + @require Arrow="69666777-d1a9-59fb-9406-91d4454c9d45" include("../ext/CategoricalArraysArrowExt.jl") @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl") @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl") @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl") + @require StatsBase="2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" include("../ext/CategoricalArraysStatsBaseExt.jl") @require StructTypes="856f2bd8-1eba-4b0a-8007-ebc267875bd4" include("../ext/CategoricalArraysStructTypesExt.jl") end end diff --git a/src/array.jl b/src/array.jl index ffbf66b8..4d47e82c 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,7 +1,8 @@ ## Code 
for CategoricalArray -import Base: Array, convert, collect, copy, getindex, setindex!, similar, size, - unique, vcat, in, summary, float, complex, copyto! +import Base: Array, Vector, Matrix, convert, collect, copy, getindex, + setindex!, similar, size, + unique, unique!, vcat, in, summary, float, complex, copyto! # Used for keyword argument default value _isordered(x::AbstractCategoricalArray) = isordered(x) @@ -160,9 +161,8 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int}; U = leveltype(nonmissingtype(T)) S = T >: Missing ? Union{U, Missing} : U check_supported_eltype(S, T) - V = CategoricalValue{U, R} levs = levels === nothing ? U[] : collect(U, levels) - CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered)) + CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R}(levs, ordered)) end CategoricalArray{T, N}(::UndefInitializer, dims::NTuple{N,Int}; @@ -240,7 +240,7 @@ function CategoricalArray{T, N, R}(A::CategoricalArray{S, N, Q}; catch err err isa LevelsException || rethrow(err) throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) end end return res @@ -359,18 +359,18 @@ function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}; copyto!(res, A) if levels !== nothing - CategoricalArrays.levels(res) == levels || + _levels(res) == levels || throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) else # if order is defined for level type, automatically apply it L = leveltype(res) if Base.OrderStyle(L) isa Base.Ordered - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) elseif hasmethod(isless, (L, L)) # isless may throw an error, e.g. 
for AbstractArray{T} of unordered T try - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) catch e e isa MethodError || rethrow(e) end @@ -383,7 +383,7 @@ end # From CategoricalArray (preserve levels, ordering and R) function convert(::Type{CategoricalArray{T, N, R}}, A::CategoricalArray{S, N}) where {S, T, N, R} if length(A.pool) > typemax(R) - throw(LevelsException{T, R}(levels(A)[typemax(R)+1:end])) + throw(LevelsException{T, R}(_levels(A)[typemax(R)+1:end])) end if !(T >: Missing) && S >: Missing && any(iszero, A.refs) @@ -411,6 +411,12 @@ convert(::Type{CategoricalArray{T, N}}, A::CategoricalArray{T, N}) where {T, N} convert(::Type{CategoricalArray{T}}, A::CategoricalArray{T}) where {T} = A convert(::Type{CategoricalArray}, A::CategoricalArray) = A +convert(::Type{Array{S, N}}, A::CatArrOrSub{T, N}) where {S, T, N} = + collect(S, A) +convert(::Type{Array}, A::CatArrOrSub) = unwrap.(A) +convert(::Type{Vector}, A::CatArrOrSub) = unwrap.(A) +convert(::Type{Matrix}, A::CatArrOrSub) = unwrap.(A) + function Base.:(==)(A::CategoricalArray{S}, B::CategoricalArray{T}) where {S, T} if size(A) != size(B) return false @@ -461,7 +467,7 @@ size(A::CategoricalArray) = size(A.refs) Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear() function update_refs!(A::CategoricalArray, newlevels::AbstractVector) - oldlevels = levels(A) + oldlevels = _levels(A) levelsmap = similar(A.refs, length(oldlevels)+1) # 0 maps to a missing value levelsmap[1] = 0 @@ -479,7 +485,7 @@ function merge_pools!(A::CatArrOrSub, updaterefs::Bool=true, updatepool::Bool=true) newlevels, ordered = merge_pools(pool(A), pool(B)) - oldlevels = levels(A) + oldlevels = _levels(A) pA = A isa SubArray ? 
parent(A) : A ordered!(pA, ordered) # If A's levels are an ordered superset of new (merged) pool, no need to recompute refs @@ -538,8 +544,8 @@ function copyto!(dest::CatArrOrSub{T, N, R}, dstart::Integer, # try converting src to dest type to avoid partial copy corruption of dest # in the event that the src cannot be copied into dest - slevs = convert(Vector{T}, levels(src)) - dlevs = levels(dest) + slevs = convert(Vector{T}, _levels(src)) + dlevs = _levels(dest) if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs) throw(MissingException("cannot copy array with missing values to an array with element type $T")) end @@ -592,7 +598,7 @@ function copyto!(dest::CatArrOrSub{T1, N, R}, dstart::Integer, return invoke(copyto!, Tuple{AbstractArray, Integer, AbstractArray, Integer, Integer}, dest, dstart, src, sstart, n) end - newdestlevs = destlevs = copy(levels(dest)) # copy since we need original levels below + newdestlevs = destlevs = copy(_levels(dest)) # copy since we need original levels below srclevsnm = T2 >: Missing ? setdiff(srclevs, [missing]) : srclevs if !(srclevsnm ⊆ destlevs) # if order is defined for level type, automatically apply it @@ -702,7 +708,7 @@ While this will reduce memory use, this function is type-unstable, which can aff performance inside the function where the call is made. Therefore, use it with caution. """ function compress(A::CategoricalArray{T, N}) where {T, N} - R = reftype(length(levels(A.pool))) + R = reftype(length(_levels(A.pool))) convert(CategoricalArray{T, N, R}, A) end @@ -720,11 +726,11 @@ decompress(A::CategoricalArray{T, N}) where {T, N} = convert(CategoricalArray{T, N, DefaultRefType}, A) function vcat(A::CategoricalArray...) - ordered = any(isordered, A) && all(a->isordered(a) || isempty(levels(a)), A) - newlevels, ordered = mergelevels(ordered, map(levels, A)...) 
+ ordered = any(isordered, A) && all(a->isordered(a) || isempty(_levels(a)), A) + newlevels, ordered = mergelevels(ordered, map(_levels, A)...) refsvec = map(A) do a - ii = convert(Vector{Int}, indexin(levels(a.pool), newlevels)) + ii = convert(Vector{Int}, indexin(_levels(a.pool), newlevels)) [x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)} end @@ -762,23 +768,25 @@ This may include levels which do not actually appear in the data `missing` will be included only if it appears in the data and `skipmissing=false` is passed. -The returned vector is an internal field of `x` which must not be mutated +The returned vector is owned by `x` and must not be mutated as doing so would corrupt it. """ -@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T +@inline function DataAPI.levels(A::CatArrOrSub; skipmissing::Bool=true) if eltype(A) >: Missing && !skipmissing if any(==(0), refs(A)) - T[levels(pool(A)); missing] + eltype(A)[levels(pool(A)); missing] else - convert(Vector{T}, levels(pool(A))) + levels_missing(pool(A)) end else levels(pool(A)) end end +_levels(A::CatArrOrSub) = _levels(pool(A)) + """ - levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false) + levels!(A::CategoricalArray, newlevels::AbstractVector; allowmissing::Bool=false) Set the levels categorical array `A`. The order of appearance of levels will be respected by [`levels`](@ref DataAPI.levels), which may affect display of results in some operations; if `A` is @@ -791,14 +799,8 @@ entries corresponding to omitted levels will be set to `missing`. Else, `newlevels` must include all levels which appear in the data. """ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; - allowmissing::Bool=false, - allow_missing::Union{Bool, Nothing}=nothing) where {T, N, R} - if allow_missing !== nothing - Base.depwarn("allow_missing argument is deprecated, use allowmissing instead", - :levels!) 
- allowmissing = allow_missing - end - (levels(A) == newlevels) && return A # nothing to do + allowmissing::Bool=false) where {T, N, R} + (_levels(A) == newlevels) && return A # nothing to do # map each new level to its ref code newlv2ref = Dict{eltype(newlevels), Int}() @@ -813,7 +815,7 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; end # map each old ref code to new ref code (or 0 if no such level) - oldlevels = levels(pool(A)) + oldlevels = _levels(pool(A)) oldref2newref = fill(0, length(oldlevels) + 1) for (i, lv) in enumerate(oldlevels) oldref2newref[i + 1] = get(newlv2ref, lv, 0) @@ -868,31 +870,36 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; return A end -function _unique(::Type{S}, - refs::AbstractArray{T}, - pool::CategoricalPool) where {S, T<:Integer} - nlevels = length(levels(pool)) + 1 - order = fill(0, nlevels) # 0 indicates not seen - # If we don't track missings, short-circuit even if none has been seen - count = S >: Missing ? 0 : 1 - @inbounds for i in refs - if order[i + 1] == 0 - count += 1 - order[i + 1] = count - count == nlevels && break +# return unique refs (each value is unique) in the order of appearance in `refs` +# equivalent to fallback Base.unique() implementation, +# but short-circuits once references to all levels are encountered +function _uniquerefs(A::CatArrOrSub{T}) where T + arefs = refs(A) + res = similar(arefs, 0) + nlevels = length(_levels(A)) + maxunique = nlevels + (T >: Missing ? 1 : 0) + seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref) + @inbounds for ref in arefs + if !seen[ref + 1] + push!(res, ref) + seen[ref + 1] = true + (length(res) == maxunique) && break end end - S[i == 1 ? 
missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0] + return res end -""" - unique(A::CategoricalArray) +unique(A::CatArrOrSub{T}) where T = + CategoricalVector{T}(_uniquerefs(A), copy(pool(A))) -Return levels which appear in `A` in their order of appearance. -This function is significantly slower than [`levels`](@ref DataAPI.levels) -since it needs to check whether levels are used or not. -""" -unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool) +function unique!(A::CategoricalVector) + urefs = _uniquerefs(A) + if length(urefs) != length(A) + resize!(A.refs, length(urefs)) + copyto!(A.refs, urefs) + end + return A +end """ droplevels!(A::CategoricalArray) @@ -902,7 +909,7 @@ returned by [`levels`](@ref DataAPI.levels)). """ function droplevels!(A::CategoricalArray) arefs = refs(A) - nlevels = length(levels(A)) + 1 # +1 for missing + nlevels = length(_levels(A)) + 1 # +1 for missing seen = fill(false, nlevels) seen[1] = true # assume that missing is always observed to simplify checks nseen = 1 @@ -915,7 +922,7 @@ function droplevels!(A::CategoricalArray) end # replace the pool - A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + A.pool = typeof(pool(A))(@inbounds(_levels(A)[view(seen, 2:nlevels)]), isordered(A)) # recode refs to keep only the seen ones (optimized version of update_refs!()) seen[1] = false # to start levelsmap from 0 levelsmap = cumsum(seen) @@ -1032,7 +1039,7 @@ end ordered=_isordered(A), compress::Bool=false) where {T, N, R} # @inline is needed so that return type is inferred when compress is not provided - RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R + RefType = compress ? 
reftype(length(_levels(A))) : R CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered) end @@ -1045,13 +1052,15 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R} if x.pool === y.pool return refcode(x) in y.refs else - ref = get(y.pool, levels(x.pool)[refcode(x)], zero(R)) + ref = get(y.pool, _levels(x.pool)[refcode(x)], zero(R)) return ref != 0 ? ref in y.refs : false end end -Array(A::CategoricalArray{T}) where {T} = Array{T}(A) -collect(A::CategoricalArray) = copy(A) +Array(A::CatArrOrSub{T}) where {T} = Array{T}(A) +Vector(A::CatArrOrSub{T}) where {T} = Vector{T}(A) +Matrix(A::CatArrOrSub{T}) where {T} = Matrix{T}(A) +collect(A::CatArrOrSub) = copy(A) # Defined for performance collect(x::Base.SkipMissing{<: CatArrOrSub{T}}) where {T} = @@ -1121,7 +1130,7 @@ function Base.sort!(v::CategoricalVector; levs = eltype(v) >: Missing ? eltype(v)[i == 0 ? missing : CategoricalValue(v.pool, i) for i in 0:length(v.pool)] : eltype(v)[CategoricalValue(v.pool, i) for i in 1:length(v.pool)] - sortedlevs = sort!(Vector(view(levs, seen)), order=ord) + sortedlevs = sort!(Vector{eltype(levs)}(view(levs, seen)), order=ord) levelsmap = something.(indexin(sortedlevs, levs)) j = 0 refs = v.refs diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index 667b2923..00000000 --- a/src/deprecated.jl +++ /dev/null @@ -1,18 +0,0 @@ -function index(pool::CategoricalPool) - throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " * - "use levels(pool) instead")) -end -function order(pool::CategoricalPool) - throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " * - "use 1:length(levels(pool)) instead")) -end - -function categorical(A::AbstractArray, compress::Bool; kwargs...) - throw(ErrorException("categorical(A::AbstractArray, compress, kwargs...) is deprecated: " * - "use categorical(A, compress=compress, kwargs...) 
instead.")) -end - -import Base: get - -@deprecate get(x::CategoricalValue) DataAPI.unwrap(x) -@deprecate CategoricalValue(i::Integer, pool::CategoricalPool) pool[i] \ No newline at end of file diff --git a/src/extras.jl b/src/extras.jl index 137875b8..910c6e46 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -9,11 +9,14 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, @inbounds for i in eachindex(X) x = X[i] - if ismissing(x) + if x isa Number && isnan(x) + throw(ArgumentError("NaN values are not allowed in input vector")) + elseif ismissing(x) refs[i] = 0 - elseif extend === true && x == upper + elseif isequal(x, upper) refs[i] = n-1 - elseif extend !== true && !(lower <= x < upper) + elseif extend !== true && + !((isless(lower, x) || isequal(x, lower)) && isless(x, upper)) extend === missing || throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " * "adapt them manually, or pass extend=true or extend=missing")) @@ -24,43 +27,101 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, end end +if VERSION >= v"1.10" + const CUT_FMT = Printf.Format("%.*g") +end + +""" + CategoricalArrays.default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + +Provide the default label format for the `cut(x, breaks)` method, +which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise. + +If they are floating point values, breaks are turned into strings using +`@sprintf("%.*g", sigdigits, break)` +(the same format is applied to both `from` and `to`). +""" +function default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + @static if VERSION >= v"1.10" + from_str = from isa AbstractFloat ? + Printf.format(CUT_FMT, sigdigits, from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(CUT_FMT, sigdigits, to) : + string(to) + else + from_str = from isa AbstractFloat ? 
+ Printf.format(Printf.Format("%.$(sigdigits)g"), from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(Printf.Format("%.$(sigdigits)g"), to) : + string(to) + end + string(leftclosed ? "[" : "(", from_str, ", ", to_str, rightclosed ? "]" : ")") +end + """ - default_formatter(from, to, i; leftclosed, rightclosed) + CategoricalArrays.numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + +Provide the default label format for the `cut(x, ngroups)` method +when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed` +is `true` and `"i: (from, to)"` otherwise. -Provide the default label format for the `cut(x, breaks)` method. +If they are floating point values, breaks are turned into strings using +`@sprintf("%.*g", sigdigits, break)` +(the same format is applied to both `from` and `to`). """ -default_formatter(from, to, i; leftclosed, rightclosed) = - string(leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") +numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) = + string(i, ": ", + default_formatter(from, to, i, leftclosed=leftclosed, rightclosed=rightclosed, + sigdigits=sigdigits)) @doc raw""" cut(x::AbstractArray, breaks::AbstractVector; labels::Union{AbstractVector,Function}, + sigdigits::Integer=3, extend::Union{Bool,Missing}=false, allowempty::Bool=false) Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating -the interval into which each entry falls. Intervals are of the form `[lower, upper)`, -i.e. the lower bound is included and the upper bound is excluded, except -if `extend=true` the last interval, which is then closed on both ends, -i.e. `[lower, upper]`. +the interval into which each entry falls. Intervals are of the form `[lower, upper)` +(closed on the left), i.e. 
the lower bound is included and the upper bound is excluded, except +the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will also accept them. +!!! note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. + # Keyword arguments * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values in `x` fall outside of the breaks; when `true`, breaks are automatically added to include - all values in `x`, and the upper bound is included in the last interval; when `missing`, - values outside of the breaks generate `missing` entries. + all values in `x`; when `missing`, values outside of the breaks generate `missing` entries. * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`). -* `allowempty::Bool=false`: when `false`, an error is raised if some breaks appear - multiple times, generating empty intervals; when `true`, duplicate breaks are allowed - and the intervals they generate are kept as unused levels - (but duplicate labels are not allowed). + [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval). +* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
+ This value is increased automatically if necessary so that rounded breaks are unique. + Only used for floating point types and when `labels` is a function, in which case it + is passed to it as a keyword argument. +* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than + the last one appear multiple times, generating empty intervals; when `true`, + duplicate breaks are allowed and the intervals they generate are kept as + unused levels (but duplicate labels are not allowed). # Examples ```jldoctest @@ -68,19 +129,19 @@ julia> using CategoricalArrays julia> cut(-1:0.5:1, [0, 1], extend=true) 5-element CategoricalArray{String,1,UInt32}: - "[-1.0, 0.0)" - "[-1.0, 0.0)" - "[0.0, 1.0]" - "[0.0, 1.0]" - "[0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2) 5-element CategoricalArray{String,1,UInt32}: - "Q1: [-1.0, 0.0)" - "Q1: [-1.0, 0.0)" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: @@ -88,7 +149,7 @@ julia> cut(-1:0.5:1, 2, labels=["A", "B"]) "A" "B" "B" - "B" + "B" julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5]) 5-element CategoricalArray{Float64,1,UInt32}: @@ -103,45 +164,40 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.3333333333333335)" - "grp 1 (-1.0//-0.3333333333333335)" - "grp 2 (-0.3333333333333335//0.33333333333333326)" - "grp 3 (0.33333333333333326//1.0)" - "grp 3 (0.33333333333333326//1.0)" + "grp 1 (-1.0//0.0)" + "grp 1 (-1.0//0.0)" + "grp 2 (0.0//0.5)" + "grp 3 (0.5//1.0)" + "grp 3 (0.5//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; extend::Union{Bool, Missing}=false, labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter, - allowmissing::Union{Bool, Nothing}=nothing, - 
allow_missing::Union{Bool, Nothing}=nothing, + sigdigits::Integer=3, allowempty::Bool=false) - if allow_missing !== nothing - Base.depwarn("allow_missing argument is deprecated, use extend=missing instead", - :cut) - extend = missing - end - if allowmissing !== nothing - Base.depwarn("allowmissing argument is deprecated, use extend=missing instead", - :cut) - extend = missing - end - return _cut(x, breaks, extend, labels, allowempty) + return _cut(x, breaks, extend, labels, sigdigits, allowempty) end # Separate function for inferability (thanks to inlining of cut) function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, extend::Union{Bool, Missing}, labels::Union{AbstractVector{<:SupportedTypes},Function}, - allowempty::Bool=false) where {T, N} - if !allowempty && !allunique(breaks) - throw(ArgumentError("all breaks must be unique unless `allowempty=true`")) - end - + sigdigits::Integer, + allowempty::Bool) where {T, N} if !issorted(breaks) breaks = sort(breaks) end + if any(x -> x isa Number && isnan(x), breaks) + throw(ArgumentError("NaN values are not allowed in breaks")) + end + + if !allowempty && !allunique(@view breaks[1:end-1]) + throw(ArgumentError("all breaks other than the last one must be unique " * + "unless `allowempty=true`")) + end + if extend === true xnm = T >: Missing ? 
skipmissing(x) : x length(breaks) >= 1 || throw(ArgumentError("at least one break must be provided")) @@ -160,11 +216,11 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, rethrow(err) end end - if !ismissing(min_x) && breaks[1] > min_x + if !ismissing(min_x) && isless(min_x, breaks[1]) # this type annotation is needed on Julia<1.7 for stable inference breaks = [min_x::nonmissingtype(eltype(x)); breaks] end - if !ismissing(max_x) && breaks[end] < max_x + if !ismissing(max_x) && isless(breaks[end], max_x) breaks = [breaks; max_x::nonmissingtype(eltype(x))] end length(breaks) > 1 || @@ -185,22 +241,60 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, end end + # Find minimal number of digits so that distinct breaks remain so, rescanning after each increase + if eltype(breaks) <: AbstractFloat + while true + prev_sigdigits = sigdigits + for i in 2:lastindex(breaks) + b1 = breaks[i-1] + b2 = breaks[i] + isequal(b1, b2) && continue + + @static if VERSION >= v"1.10" # CUT_FMT is only defined on Julia >= 1.10 + b1_str = Printf.format(CUT_FMT, sigdigits, b1) + b2_str = Printf.format(CUT_FMT, sigdigits, b2) + else + b1_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b1) + b2_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b2) + end + if b1_str == b2_str + sigdigits += 1 + break + end + end + sigdigits == prev_sigdigits && break # stop once a full pass finds no collision + end + end n = length(breaks) n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true")) if labels isa Function from = breaks[1:n-1] to = breaks[2:n] - firstlevel = labels(from[1], to[1], 1, - leftclosed=breaks[1] != breaks[2], rightclosed=false) + local firstlevel + try + firstlevel = labels(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false, + sigdigits=sigdigits) + catch + # Support functions defined before v1.0, where sigdigits did not exist + Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument", + :cut) + labels_orig = labels + labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
labels_orig(from, to, i; leftclosed, rightclosed) + firstlevel = labels_orig(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) + end levs = Vector{typeof(firstlevel)}(undef, n-1) levs[1] = firstlevel for i in 2:n-2 levs[i] = labels(from[i], to[i], i, - leftclosed=breaks[i] != breaks[i+1], rightclosed=false) + leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false, + sigdigits=sigdigits) end levs[end] = labels(from[end], to[end], n-1, - leftclosed=breaks[end-1] != breaks[end], - rightclosed=coalesce(extend, false)) + leftclosed=true, rightclosed=true, + sigdigits=sigdigits) else length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) @@ -221,45 +315,114 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, end """ - quantile_formatter(from, to, i; leftclosed, rightclosed) - -Provide the default label format for the `cut(x, ngroups)` method. +Find first value in (sorted) `v` which is greater than or equal to each quantile +in (sorted) `qs`. """ -quantile_formatter(from, to, i; leftclosed, rightclosed) = - string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? 
"]" : ")") +function find_breaks(v::AbstractVector, qs::AbstractVector) + n = length(qs) + breaks = similar(v, n) + n == 0 && return breaks + + i = 1 + q = qs[1] + @inbounds for x in v + # Use isless and isequal to differentiate -0.0 from 0.0 + if isless(q, x) || isequal(q, x) + breaks[i] = x + i += 1 + i > n && break + q = qs[i] + end + end + return breaks +end + +# AbstractWeights method is defined in StatsBase extension +# There is no in-place weighted quantile method in StatsBase +_wquantile(x::AbstractArray, w::AbstractVector, p::AbstractVector) = + throw(ArgumentError("`weights` must be an `AbstractWeights` vector from StatsBase.jl")) """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, - allowempty::Bool=false) + sigdigits::Integer=3, + allowempty::Bool=false, + weights::Union{AbstractWeights, Nothing}=nothing) + +Cut a numeric array into `ngroups` quantiles. -Cut a numeric array into `ngroups` quantiles, determined using `quantile`. +This is equivalent to `cut(x, quantile(x, (0:ngroups)/ngroups))`, +but breaks are taken from actual data values instead of estimated quantiles. If `x` contains `missing` values, they are automatically skipped when computing quantiles. +!!! note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. + # Keyword arguments * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval). 
+ [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval) if `allowempty=false`, otherwise to + [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile + number to ensure uniqueness. +* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding + breaks for inclusion in generated labels. This value is increased automatically if necessary + so that rounded breaks are unique. Only used for floating point types and when `labels` is a + function, in which case it is passed to it as a keyword argument. * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints - are equal, generating empty intervals; when `true`, duplicate breaks are allowed - and the intervals they generate are kept as unused levels - (but duplicate labels are not allowed). + other than the last one are equal, generating empty intervals; + when `true`, duplicate breaks are allowed and the intervals they generate are kept as + unused levels (but duplicate labels are not allowed). +* `weights::Union{AbstractWeights, Nothing}=nothing`: observation weights to use when + computing quantiles (see `quantile` documentation in StatsBase). """ function cut(x::AbstractArray, ngroups::Integer; - labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, - allowempty::Bool=false) - xnm = eltype(x) >: Missing ? skipmissing(x) : x - breaks = Statistics.quantile(xnm, (1:ngroups-1)/ngroups) - if !allowempty && !allunique(breaks) - n = length(unique(breaks)) - 1 - throw(ArgumentError("cannot compute $ngroups quantiles: `quantile` " * - "returned only $n groups due to duplicated values in `x`."
* + labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing, + sigdigits::Integer=3, + allowempty::Bool=false, + weights::Union{AbstractVector, Nothing}=nothing) + ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) + if weights === nothing + sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) + min_x, max_x = first(sorted_x), last(sorted_x) + if (min_x isa Number && isnan(min_x)) || + (max_x isa Number && isnan(max_x)) + throw(ArgumentError("NaN values are not allowed in input vector")) + end + qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) + else + if eltype(x) >: Missing + nm_inds = findall(!ismissing, x) + nm_x = view(x, nm_inds) + # TODO: use a view once this is supported (JuliaStats/StatsBase.jl#723) + nm_weights = weights[nm_inds] + else + nm_x = x + nm_weights = weights + end + sorted_x = sort(nm_x) + min_x, max_x = first(sorted_x), last(sorted_x) + if (min_x isa Number && isnan(min_x)) || + (max_x isa Number && isnan(max_x)) + throw(ArgumentError("NaN values are not allowed in input vector")) + end + qs = _wquantile(nm_x, nm_weights, (1:(ngroups-1))/ngroups) + end + breaks = [min_x; find_breaks(sorted_x, qs); max_x] + if !allowempty && !allunique(@view breaks[1:end-1]) + throw(ArgumentError("cannot compute $ngroups quantiles due to " * + "too many duplicated values in `x`. " * "Pass `allowempty=true` to allow empty quantiles or " * "choose a lower value for `ngroups`.")) end - cut(x, breaks; extend=true, labels=labels, allowempty=allowempty) + if labels === nothing + labels = allowempty ? numbered_formatter : default_formatter + end + return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty) end diff --git a/src/pool.jl b/src/pool.jl index 1018cfee..2df7e345 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -2,20 +2,18 @@ const catpool_seed = UInt === UInt32 ? 
0xe3cf1386 : 0x356f2c715023f1a5 hashlevels(levs::AbstractVector) = foldl((h, x) -> hash(x, h), levs, init=catpool_seed) -CategoricalPool{T, R, V}(ordered::Bool=false) where {T, R, V} = - CategoricalPool{T, R, V}(T[], ordered) CategoricalPool{T, R}(ordered::Bool=false) where {T, R} = CategoricalPool{T, R}(T[], ordered) CategoricalPool{T}(ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(T[], ordered) CategoricalPool{T, R}(levels::AbstractVector, ordered::Bool=false) where {T, R} = - CategoricalPool{T, R, CategoricalValue{T, R}}(convert(Vector{T}, levels), ordered) + CategoricalPool{T, R}(convert(Vector{T}, levels), ordered) CategoricalPool(levels::AbstractVector{T}, ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(convert(Vector{T}, levels), ordered) CategoricalPool(invindex::Dict{T, R}, ordered::Bool=false) where {T, R <: Integer} = - CategoricalPool{T, R, CategoricalValue{T, R}}(invindex, ordered) + CategoricalPool{T, R}(invindex, ordered) Base.convert(::Type{T}, pool::T) where {T <: CategoricalPool} = pool @@ -23,26 +21,26 @@ Base.convert(::Type{CategoricalPool{S}}, pool::CategoricalPool{T, R}) where {S, convert(CategoricalPool{S, R}, pool) function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) where {T, R <: Integer} - if length(levels(pool)) > typemax(R) - throw(LevelsException{T, R}(levels(pool)[typemax(R)+1:end])) + if length(pool.levels) > typemax(R) + throw(LevelsException{T, R}(pool.levels[typemax(R)+1:end])) end levelsT = convert(Vector{T}, pool.levels) invindexT = convert(Dict{T, R}, pool.invindex) - return CategoricalPool{T, R, CategoricalValue{T, R}}(levelsT, invindexT, pool.ordered) + return CategoricalPool{T, R}(levelsT, invindexT, pool.ordered) end -Base.copy(pool::CategoricalPool{T, R, V}) where {T, R, V} = - CategoricalPool{T, R, V}(copy(pool.levels), copy(pool.invindex), - pool.ordered, pool.hash) +Base.copy(pool::CategoricalPool{T, R}) where {T, R} = + CategoricalPool{T, 
R}(copy(pool.levels), copy(pool.invindex), + pool.ordered, pool.hash) function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R} @static if VERSION >= v"1.6.0" @printf(io, "%s{%s, %s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) else @printf(io, "%s{%s,%s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) end pool.ordered && print(io, " with ordered levels") @@ -67,8 +65,10 @@ it doesn't do this itself to avoid doing a dict lookup twice i = R(n + 1) push!(pool.levels, x) - if pool.hash !== nothing - pool.hash = hash(x, pool.hash) + push!(pool.levelsinds, i) + pool_hash = pool.hash + if pool_hash !== nothing + pool.hash = hash(x, pool_hash) end pool.equalto = C_NULL pool.subsetof = C_NULL @@ -186,10 +186,10 @@ function merge_pools(a::CategoricalPool{T}, b::CategoricalPool) where {T} newlevs = T[] ordered = isordered(a) elseif length(a) == 0 - newlevs = Vector{T}(levels(b)) + newlevs = Vector{T}(b.levels) ordered = isordered(b) elseif length(b) == 0 - newlevs = copy(levels(a)) + newlevs = copy(a.levels) ordered = isordered(a) else ordered = isordered(a) && (isordered(b) || b ⊆ a) @@ -201,7 +201,7 @@ end @inline function Base.hash(pool::CategoricalPool, h::UInt) if pool.hash === nothing - pool.hash = hashlevels(levels(pool)) + pool.hash = hashlevels(pool.levels) end hash(pool.hash, h) end @@ -247,9 +247,9 @@ end # Contrary to the CategoricalArray one, this method only allows adding new levels at the end # so that existing CategoricalValue objects still point to the same value -function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; +function levels!(pool::CategoricalPool{S, R}, newlevels::AbstractVector; checkunique::Bool=true) where {S, R} - levs = convert(Vector{S}, newlevels) + levs = newlevels isa CategoricalVector{S} ? 
newlevels : convert(Vector{S}, newlevels) if checkunique && !allunique(levs) throw(ArgumentError(string("duplicated levels found in levs: ", join(unique(filter(x->sum(levs.==x)>1, levs)), ", ")))) @@ -260,24 +260,30 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; n = length(levs) if n > typemax(R) - throw(LevelsException{S, R}(setdiff(levs, levels(pool))[typemax(R)-length(levels(pool))+1:end])) + throw(LevelsException{S, R}(setdiff(levs, pool.levels)[typemax(R)-length(pool.levels)+1:end])) end empty!(pool.invindex) resize!(pool.levels, n) + resize!(pool.levelsinds, n) pool.hash = nothing pool.equalto = C_NULL pool.subsetof = C_NULL for i in 1:n v = levs[i] pool.levels[i] = v + pool.levelsinds[i] = i pool.invindex[v] = i end return pool end -DataAPI.levels(pool::CategoricalPool) = pool.levels +DataAPI.levels(pool::CategoricalPool{T}) where {T} = + CategoricalVector{T}(pool.levelsinds, pool) +levels_missing(pool::CategoricalPool{T}) where {T} = + CategoricalVector{Union{T, Missing}}(pool.levelsinds, pool) +_levels(pool::CategoricalPool) = pool.levels isordered(pool::CategoricalPool) = pool.ordered ordered!(pool::CategoricalPool, ordered) = (pool.ordered = ordered; pool) diff --git a/src/recode.jl b/src/recode.jl index 282d4fb6..ff258e60 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -52,27 +52,34 @@ A user defined type could override this method to define an appropriate test fun optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second -function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} +function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::Pair...) 
if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - opt_pairs = map(optimize_pair, pairs) + opt_pairs = optimize_pair.(pairs) + _recode!(dest, src, default, opt_pairs) +end + +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, + pairs::NTuple{<:Any, Pair}) where {T} + recode_to = last.(pairs) + recode_from = first.(pairs) + @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) - dest[i] = p.second - @goto nextitem - end - end - + # @inline is needed for type stability and Compat for compatibility before julia v1.8 + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + + # Value in one of the pairs + if j !== nothing + dest[i] = recode_to[j] # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -89,45 +96,40 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs else dest[i] = default end - - @label nextitem end dest end -function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) 
where {T} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) - end - - opt_pairs = map(optimize_pair, pairs) +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, + pairs::NTuple{<:Any, Pair}) where {T, R} + recode_from = first.(pairs) + vals = T[p.second for p in pairs] - vals = T[p.second for p in opt_pairs] default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference # for each pair in the loop, which is more efficient (with loop unswitching) - dupvals = length(vals) != length(levels(dest.pool)) + dupvals = length(vals) != length(_levels(dest.pool)) drefs = dest.refs - pairmap = [ismissing(v) ? 0 : get(dest.pool, v) for v in vals] - defaultref = default === nothing || ismissing(default) ? 0 : get(dest.pool, default) + pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] + defaultref = default === nothing || ismissing(default) ? zero(R) : get(dest.pool, default) + @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) - drefs[i] = dupvals ? pairmap[j] : j - @goto nextitem - end - end + # @inline is needed for type stability and Compat for compatibility before julia v1.8 + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from) + # Value in one of the pairs + if j !== nothing + drefs[i] = dupvals ? 
pairmap[j] : j # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -144,13 +146,11 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa else drefs[i] = defaultref end - - @label nextitem end # Put existing levels first, and sort them if possible # for consistency with CategoricalArray - oldlevels = setdiff(levels(dest), vals) + oldlevels = setdiff(_levels(dest), vals) filter!(!ismissing, oldlevels) L = eltype(oldlevels) if Base.OrderStyle(L) isa Base.Ordered @@ -163,30 +163,26 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa e isa MethodError || rethrow(e) end end - levels!(dest, union(oldlevels, levels(dest))) + levels!(dest, union(oldlevels, _levels(dest))) dest end -function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, - default::Any, pairs::Pair...) 
where {T, N, R<:Integer} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length " * - "(got $(length(dest)) and $(length(src)))")) - end - +function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, + default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer} + recode_from = first.(pairs) vals = T[p.second for p in pairs] + if default === nothing - srclevels = levels(src) + srclevels = _levels(src) # Remove recoded levels as they won't appear in result - firsts = (p.first for p in pairs) keptlevels = Vector{T}(undef, 0) sizehint!(keptlevels, length(srclevels)) for l in srclevels - if !(any(x -> x ≅ l, firsts) || - any(f -> recode_in(l, f), firsts)) + if !(any(x -> x ≅ l, recode_from) || + any(f -> recode_in(l, f), recode_from)) try push!(keptlevels, l) catch err @@ -205,7 +201,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, ordered = false end - srclevels = src.pool === dest.pool ? copy(levels(src.pool)) : levels(src.pool) + srclevels = src.pool === dest.pool ? copy(_levels(src.pool)) : _levels(src.pool) if length(levs) > length(srclevels) && view(levs, 1:length(srclevels)) == srclevels levels!(dest.pool, levs) else diff --git a/src/subarray.jl b/src/subarray.jl index 3e5f3f39..d7bf72df 100644 --- a/src/subarray.jl +++ b/src/subarray.jl @@ -5,13 +5,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} = levels!(parent(sa), newlevels) -function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} - A = parent(sa) - refs = view(A.refs, sa.indices...) - S = eltype(P) >: Missing ? Union{eltype(levels(A.pool)), Missing} : eltype(levels(A.pool)) - _unique(S, refs, A.pool) -end - refs(A::SubArray{<:Any, <:Any, <:CategoricalArray}) = view(parent(A).refs, parentindices(A)...) 
diff --git a/src/typedefs.jl b/src/typedefs.jl index 973cbaf8..238bb995 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -6,17 +6,17 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number} # Type params: # * `T` type of categorized values # * `R` integer type for referencing category levels -# * `V` categorical value type -mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} +mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} levels::Vector{T} # category levels ordered by their reference codes + levelsinds::Vector{R} # set to 1:length(levels), used by `levels(p)` invindex::Dict{T, R} # map from category levels to their reference codes ordered::Bool # whether levels can be compared using < hash::Union{UInt, Nothing} # hash of levels subsetof::Ptr{Nothing} # last seen strict superset pool equalto::Ptr{Nothing} # last seen equal pool - function CategoricalPool{T, R, V}(levels::Vector{T}, - ordered::Bool) where {T, R, V} + function CategoricalPool{T, R}(levels::Vector{T}, + ordered::Bool) where {T, R} if length(levels) > typemax(R) throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end])) end @@ -24,10 +24,10 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} if length(invindex) != length(levels) throw(ArgumentError("Duplicate entries are not allowed in levels")) end - CategoricalPool{T, R, V}(levels, invindex, ordered) + CategoricalPool{T, R}(levels, invindex, ordered) end - function CategoricalPool{T, R, V}(invindex::Dict{T, R}, - ordered::Bool) where {T, R, V} + function CategoricalPool{T, R}(invindex::Dict{T, R}, + ordered::Bool) where {T, R} levels = Vector{T}(undef, length(invindex)) # If invindex contains non consecutive values, a BoundsError will be thrown try @@ -40,20 +40,14 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} if length(invindex) > typemax(R) throw(LevelsException{T, R}(levels[typemax(R)+1:end])) end - CategoricalPool{T, R, V}(levels, invindex, 
ordered) + CategoricalPool{T, R}(levels, invindex, ordered) end - function CategoricalPool{T, R, V}(levels::Vector{T}, - invindex::Dict{T, R}, - ordered::Bool, - hash::Union{UInt, Nothing}=nothing) where {T, R, V} - if !(V <: CategoricalValue) - throw(ArgumentError("Type $V is not a categorical value type")) - end - if V !== CategoricalValue{T, R} - throw(ArgumentError("V must be CategoricalValue{T, R}")) - end - pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL) - return pool + function CategoricalPool{T, R}(levels::Vector{T}, + invindex::Dict{T, R}, + ordered::Bool, + hash::Union{UInt, Nothing}=nothing) where {T, R} + return new(levels, 1:length(levels), invindex, + ordered, hash, C_NULL, C_NULL) end end @@ -77,7 +71,7 @@ the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the ordering of values of type `T`. """ struct CategoricalValue{T <: SupportedTypes, R <: Integer} - pool::CategoricalPool{T, R, CategoricalValue{T, R}} + pool::CategoricalPool{T, R} ref::R end @@ -98,14 +92,14 @@ const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2, mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractCategoricalArray{T, N, R, V, C, U} refs::Array{R, N} - pool::CategoricalPool{V, R, C} + pool::CategoricalPool{V, R} function CategoricalArray{T, N}(refs::Array{R, N}, - pool::CategoricalPool{V, R, C}) where - {T, N, R <: Integer, V, C} + pool::CategoricalPool{V, R}) where + {T, N, R <: Integer, V} T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}")) U = T >: Missing ? 
Missing : Union{} - new{T, N, R, V, C, U}(refs, pool) + new{T, N, R, V, CategoricalValue{V, R}, U}(refs, pool) end end const CategoricalVector{T, R <: Integer, V, C, U} = CategoricalArray{T, 1, R, V, C, U} diff --git a/src/value.jl b/src/value.jl index ae962adb..a1633204 100644 --- a/src/value.jl +++ b/src/value.jl @@ -27,6 +27,8 @@ reftype(x::Any) = reftype(typeof(x)) pool(x::CategoricalValue) = x.pool refcode(x::CategoricalValue) = x.ref isordered(x::CategoricalValue) = isordered(x.pool) +DataAPI.levels(x::CategoricalValue) = levels(pool(x)) +_levels(x::CategoricalValue) = _levels(pool(x)) # extract the type of the original value from array eltype `T` unwrap_catvaluetype(::Type{T}) where {T} = T @@ -42,7 +44,7 @@ unwrap_catvaluetype(::Type{T}) where {T <: CategoricalValue} = leveltype(T) Get the value wrapped by categorical value `x`. If `x` is `Missing` return `missing`. """ -DataAPI.unwrap(x::CategoricalValue) = levels(x)[refcode(x)] +DataAPI.unwrap(x::CategoricalValue) = _levels(x)[refcode(x)] """ levelcode(x::CategoricalValue) @@ -59,10 +61,8 @@ Return `missing`. """ levelcode(x::Missing) = missing -DataAPI.levels(x::CategoricalValue) = levels(pool(x)) - function cat_promote_type(::Type{S}, ::Type{T}) where {S, T} - U = promote_type(S, T) + U = promote_type(unwrap_catvaluetype(S), unwrap_catvaluetype(T)) U <: Union{SupportedTypes, Missing} ? 
U : typeintersect(Union{SupportedTypes, Missing}, Union{S, T}) end diff --git a/test/01_value.jl b/test/01_value.jl index 39f58b67..8c60ae7f 100644 --- a/test/01_value.jl +++ b/test/01_value.jl @@ -22,6 +22,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt32} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === DefaultRefType @@ -48,6 +50,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt8} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === UInt8 @@ -68,7 +72,8 @@ end for x in (CategoricalValue(pool, 1), arr, view(arr, 2:3)) for (i, v) in enumerate(levels(pool)) @test CategoricalValue(v, x) === - CategoricalValue(float(v), x) === + CategoricalValue(unwrap(v), x) === + CategoricalValue(float(unwrap(v)), x) === CategoricalValue(CategoricalValue(pool, i), x) === CategoricalValue(pool, i) end diff --git a/test/04_constructors.jl b/test/04_constructors.jl index 5b39f95e..2d4eb4b0 100644 --- a/test/04_constructors.jl +++ b/test/04_constructors.jl @@ -5,22 +5,10 @@ using CategoricalArrays: DefaultRefType @testset "Type parameter constraints" begin # cannot use categorical value as level type - @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}( + @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}( Dict{CategoricalValue{Int,UInt8}, UInt8}(), false) - @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}( + @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}( CategoricalValue{Int,UInt8}[], false) - # cannot use non-categorical value as categorical value type - @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Int[], false) 
- @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Dict{Int, UInt8}(), false) - # level type of the pool and categorical value must match - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Int[], false) - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Dict{Int, UInt8}(), false) - # reference type of the pool and categorical value must match - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Int[], false) - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Dict{Int, UInt8}(), false) - # correct types combination - @test CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}(Int[], false) isa CategoricalPool - @test CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}(Dict{Int, UInt8}(), false) isa CategoricalPool end @testset "empty CategoricalPool{String}" begin @@ -38,7 +26,7 @@ end @testset "empty CategoricalPool{Int}" begin pool = CategoricalPool{Int, UInt8}() - @test isa(pool, CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}) + @test isa(pool, CategoricalPool{Int, UInt8}) @test isa(pool.levels, Vector{Int}) @test length(pool.levels) == 0 @@ -50,7 +38,7 @@ end @testset "CategoricalPool{String, DefaultRefType}(a b c)" begin pool = CategoricalPool(["a", "b", "c"]) - @test isa(pool, CategoricalPool{String, UInt32, CategoricalValue{String, UInt32}}) + @test isa(pool, CategoricalPool{String, UInt32}) @test isa(pool.levels, Vector{String}) @test pool.levels == ["a", "b", "c"] @@ -156,7 +144,7 @@ end @testset "CategoricalPool{Float64, UInt8}()" begin pool = CategoricalPool{Float64, UInt8}([1.0, 2.0, 3.0]) - @test isa(pool, CategoricalPool{Float64, UInt8, CategoricalValue{Float64, UInt8}}) + @test isa(pool, CategoricalPool{Float64, UInt8}) @test CategoricalValue(pool, 1) isa CategoricalValue{Float64, UInt8} end diff --git a/test/05_convert.jl b/test/05_convert.jl index b9b93544..3e7c98be 
100644 --- a/test/05_convert.jl +++ b/test/05_convert.jl @@ -55,9 +55,9 @@ using CategoricalArrays: DefaultRefType, refcode, reftype, leveltype @test convert(Union{T, U}, v3)::T == v3 end - @test unwrap(v1) === get(v1) === 1 - @test unwrap(v2) === get(v2) === 2 - @test unwrap(v3) === get(v3) === 3 + @test unwrap(v1) === 1 + @test unwrap(v2) === 2 + @test unwrap(v3) === 3 @test promote(1, v1) === (1, 1) @test promote(1.0, v1) === (1.0, 1.0) diff --git a/test/07_levels.jl b/test/07_levels.jl index 25c54be0..b54e4d52 100644 --- a/test/07_levels.jl +++ b/test/07_levels.jl @@ -1,15 +1,16 @@ module TestLevels using Test using CategoricalArrays -using CategoricalArrays: DefaultRefType, levels!, hashlevels +using CategoricalArrays: DefaultRefType, levels!, hashlevels, _levels @testset "CategoricalPool{Int} updates levels and order correctly" begin pool = CategoricalPool([2, 1, 3]) - @test isa(levels(pool), Vector{Int}) + @test isa(levels(pool), CategoricalVector{Int, DefaultRefType}) @test length(pool) === 3 - @test levels(pool) == [2, 1, 3] - @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .=== Ref(levels(pool))) + @test levels(pool) == _levels(pool) == [2, 1, 3] + @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .== Ref(levels(pool))) + @test pool.levelsinds == 1:3 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -20,7 +21,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 4 - @test levels(pool) == [2, 1, 3, 4] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4] + @test pool.levelsinds == 1:4 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -34,7 +36,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 5 - @test levels(pool) == [2, 1, 3, 4, 0] + @test levels(pool) == _levels(pool) == [2, 
1, 3, 4, 0] + @test pool.levelsinds == 1:5 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -48,7 +51,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 7 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test pool.levelsinds == 1:7 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -64,7 +68,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 9 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test pool.levelsinds == 1:9 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -84,15 +89,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels # Adding levels while preserving existing ones levs = [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] @test levels!(pool, levs) === pool - @test levels(pool) == levs - @test levels(pool) !== levs - @test pool.hash === nothing - @test pool.equalto == C_NULL - @test pool.subsetof == C_NULL - + @test levels(pool) == _levels(pool) == levs + @test pool.levels !== levs @test isa(pool.levels, Vector{Int}) - @test length(pool) === 11 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] + @test pool.levelsinds == 1:11 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11) @test pool.hash === nothing @@ -109,7 +109,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 12 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test levels(pool) == + _levels(pool) == + 
[2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test pool.levelsinds == 1:12 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12) @test pool.hash === nothing @@ -128,7 +131,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === nothing @@ -143,7 +149,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -155,7 +164,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -178,6 +190,22 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test !isordered(p2) end +@testset "levels!(::CategoricalPool, ::CategoricalVector)" begin + pool = 
CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2, 1, 3, 4])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2.0, 1.0, 3.0, 4.0])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical([2, 2, 1, 3, 4])) + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical(1:3)) +end + @testset "overflow of reftype is detected and doesn't corrupt levels" begin res = @test_throws LevelsException{Int, UInt8} CategoricalPool{Int, UInt8}(collect(256:-1:1)) @test res.value.levels == [1] diff --git a/test/11_array.jl b/test/11_array.jl index 1edd2fef..4f332640 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -16,6 +16,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test isordered(x) === ordered @test levels(x) == sort(unique(a)) @test unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 @@ -272,6 +273,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x == collect(a) @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (4,) @test length(x) === 4 @test leveltype(x) === Float64 @@ -437,6 +439,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x[4] === CategoricalValue(x.pool, 4) @test levels(x) == unique(a) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) x[1:2] .= -1 @test x[1] === CategoricalValue(x.pool, 5) @@ -473,6 +476,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x == a @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test unique(x) isa CategoricalVector{String, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -715,7 +719,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test levels(x) == ["c", "a", "b"] ordered!(x, ordered) - v = CategoricalValue(2, 
CategoricalPool(["xyz", "b"])) + v = CategoricalValue(CategoricalPool(["xyz", "b"]), 2) x[1] = v @test x[1] === CategoricalValue(x.pool, 4) @test x[2] === CategoricalValue(x.pool, 1) @@ -729,6 +733,7 @@ end @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] @test unique(x) == ["Old", "Young", "Middle"] + @test typeof(unique(x)) === typeof(x) @test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x @test levels(x) == ["Young", "Middle", "Old", "Unused"] @test unique(x) == ["Old", "Young", "Middle"] @@ -736,20 +741,34 @@ end @test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"] @test unique(x) == ["Old", "Young", "Middle"] + y = copy(x) + @test unique!(y) === y + @test y == unique(x) + x = CategoricalArray(String[]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) - @test isa(unique(x), Vector{String}) && isempty(unique(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) + @test isa(unique(x), typeof(x)) && isempty(unique(x)) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] - @test isa(unique(x), Vector{String}) && isempty(unique(x)) + @test isa(unique(x), typeof(x)) && isempty(unique(x)) + + y = copy(x) + @test unique!(y) === y + @test y == unique(x) # To test short-circuiting x = CategoricalArray(repeat(1:10, inner=10)) @test levels(x) == collect(1:10) @test unique(x) == collect(1:10) + @test unique(x) isa typeof(x) @test levels!(x, [19:-1:1; 20]) === x @test levels(x) == [19:-1:1; 20] @test unique(x) == collect(1:10) + @test unique(x) isa typeof(x) + + y = copy(x) + @test unique!(y) === y + @test y == 1:10 end end diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index fea335c2..5c2ed3a9 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -19,9 +19,14 @@ const ≅ = isequal @test isordered(x) === ordered @test levels(x) == sort(unique(a)) @test unique(x) == unique(a) + @test 
typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 + y = copy(x) + @test y === unique!(y) + @test y == unique(x) + @test convert(CategoricalArray, x) === x @test convert(CategoricalArray{Union{String, Missing}}, x) === x @test convert(CategoricalArray{Union{String, Missing}, 1}, x) === x @@ -296,6 +301,7 @@ const ≅ = isequal @test x ≅ a @test levels(x) == filter(x->!ismissing(x), unique(a)) @test unique(x) ≅ unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 @@ -440,6 +446,7 @@ const ≅ = isequal @test x == collect(a) @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (4,) @test length(x) === 4 @test leveltype(x) === Float64 @@ -616,6 +623,7 @@ const ≅ = isequal @test x[4] === CategoricalValue(x.pool, 4) @test levels(x) == unique(a) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) x[1:2] .= -1 @test x[1] === CategoricalValue(x.pool, 5) @@ -625,6 +633,7 @@ const ≅ = isequal @test isordered(x) === false @test levels(x) == vcat(unique(a), -1) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) ordered!(x, ordered) @@ -656,6 +665,7 @@ const ≅ = isequal @test x == a @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test unique(x) isa CategoricalVector{Union{String, Missing}, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -816,6 +826,7 @@ const ≅ = isequal @test isordered(x) === ordered @test levels(x) == filter(x->!ismissing(x), unique(a)) @test unique(x) ≅ unique(a) + @test unique(x) isa CategoricalVector{Union{String, Missing}, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -1137,6 +1148,7 @@ end x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"]) @test levels(x) == ["Middle", "Old", "Young"] @test unique(x) ≅ ["Old", "Young", "Middle", missing] + @test typeof(unique(x)) === typeof(x) @test levels!(x, ["Young", 
"Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] @test unique(x) ≅ ["Old", "Young", "Middle", missing] @@ -1148,7 +1160,7 @@ end @test unique(x) ≅ ["Old", "Young", "Middle", missing] x = CategoricalArray((Union{String, Missing})[missing]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test unique(x) ≅ [missing] @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 20d61ef0..e95be673 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -8,8 +8,10 @@ using PooledArrays using JSON3 using StructTypes using RecipesBase -using Plots +using RecipesPipeline using SentinelArrays +using Arrow +using Missings const ≅ = isequal const ≇ = !isequal @@ -891,7 +893,7 @@ end @test sort(cv, rev=rev, by=byf1) ≅ sort(cv, rev=rev, by=byf1) # Check that by function is not called on unused levels/missing - byf2 = x -> (@assert get(x) != "b"; x) + byf2 = x -> (@assert x != "b"; x) replace!(cv, missing=>"a", "b"=>"a") @test sort(cv, rev=rev, by=byf2) ≅ sort(cv, rev=rev, by=byf2) end @@ -1328,18 +1330,116 @@ end @test levels(x) == [2, 1, 3, 4] end -@testset "Array(::CategoricalArray{T}) produces Array{T}" begin +@testset "Array(::CatArrOrSub{T}) produces Array{T}" begin x = [1,1,2,2] y = categorical(x) z = Array(y) @test typeof(x) == typeof(z) @test z == x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x x = [1,1,2,missing] y = categorical(x) z = Array(y) @test typeof(x) == typeof(z) @test z ≅ x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1,1,2,2] + y = categorical(x) + z = Vector(y) + @test typeof(x) == typeof(z) + @test z == x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = categorical(x) + z = Vector(y) + @test typeof(x) == 
typeof(z) + @test z ≅ x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1 1 2 2] + y = categorical(x) + z = Matrix(y) + @test typeof(x) == typeof(z) + @test z == x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1 1 2 missing] + y = categorical(x) + z = Matrix(y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x +end + +@testset "convert(Array, ::CatArrOrSub{T}) produces Array{T}" begin + x = [1,1,2,2] + y = categorical(x) + z = convert(Array, y) + @test typeof(x) == typeof(z) + @test z == x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = categorical(x) + z = convert(Array, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1,1,2,2] + y = categorical(x) + z = convert(Vector, y) + @test typeof(x) == typeof(z) + @test z == x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = categorical(x) + z = convert(Vector, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1 1 2 2] + y = categorical(x) + z = convert(Matrix, y) + @test typeof(x) == typeof(z) + @test z == x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1 1 2 missing] + y = categorical(x) + z = convert(Matrix, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x end @testset "Array{T} constructors and convert" begin @@ -2071,6 +2171,57 @@ StructTypes.StructType(::Type{<:MyCustomType}) = StructTypes.Struct() @test levels(readx.var) == levels(x.var) end +if Int == Int64 + @testset "writing and reading Arrow files" for f in (identity, passmissing(string)) + xref = f.([3, 1, 4, 1, 4]) + x = categorical(f.([3, 1, 4, 1, 
4])) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt32}, Int8, + <: CategoricalVector{eltype(xref), UInt32}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{eltype(xref),1,UInt32} + + x = categorical(f.([3, 1, 4, 1, 4]), compress=true) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt8}, Int8, + <: CategoricalVector{eltype(xref), UInt8}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{eltype(xref),1,UInt8} + + x = categorical(recode(xref, 1 => missing)) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x ≅ x + @test tbl.x isa Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing}, + Int8, + <: CategoricalVector{Union{eltype(xref), Missing}, + UInt32}} + @test copy(tbl.x) ≅ x + @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32} + + recode!(x, missing => f(1)) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing}, Int8, + <: CategoricalVector{Union{eltype(xref), Missing}, UInt32}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32} + end +end + @testset "refarray, refvalue, refpool, and invrefpool" begin for y in (categorical(["b", "a", "c", "b"]), view(categorical(["a", "a", "c", "b"]), 1:3), @@ -2273,18 +2424,18 @@ end view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} 
@test levels(x, skipmissing=false) == ["b", "c", "a"] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]), view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end end diff --git a/test/14_view.jl b/test/14_view.jl index 79b20812..11853853 100644 --- a/test/14_view.jl +++ b/test/14_view.jl @@ -11,7 +11,8 @@ const ≅ = isequal x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order) v = view(x, inds) - @test levels(v) === levels(x) + @test levels(x) isa CategoricalVector{nonmissingtype(eltype(a))} + @test levels(v) == levels(x) @test unique(v) == (ndims(v) > 0 ? 
unique(a[inds]) : [a[inds]]) @test isordered(v) === isordered(x) end diff --git a/test/15_extras.jl b/test/15_extras.jl index 472885a1..80dc14b7 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -1,32 +1,44 @@ module TestExtras using Test using CategoricalArrays +using StatsBase +using Missings const ≅ = isequal @testset "cut($(Union{Int, T})[...])" for T in (Union{}, Missing) x = @inferred cut(Vector{Union{Int, T}}([2, 3, 5]), [1, 3, 6]) - @test x == ["[1, 3)", "[3, 6)", "[3, 6)"] + @test x == ["[1, 3)", "[3, 6]", "[3, 6]"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[1, 3)", "[3, 6)"] + @test levels(x) == ["[1, 3)", "[3, 6]"] + + @test cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=false) == + ["[2, 5]", "[2, 5]", "[2, 5]"] err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6]) @test err.value.msg == "value 2 (at index 1) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing" - err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5]) - @test err.value.msg == "value 5 (at index 3) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing" - if T === Missing x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing) else x = cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing) end - @test x ≅ ["[2, 5)", "[2, 5)", missing] + @test x ≅ ["[2, 5]", "[2, 5]", "[2, 5]"] + @test isa(x, CategoricalVector{Union{String, Missing}}) + @test isordered(x) + @test levels(x) == ["[2, 5]"] + + if T === Missing + x = @inferred cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing) + else + x = cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing) + end + @test x ≅ ["[2, 5]", "[2, 5]", missing] @test isa(x, CategoricalVector{Union{String, Missing}}) @test isordered(x) - @test levels(x) == ["[2, 5)"] + @test levels(x) == ["[2, 5]"] x = @inferred cut(Vector{Union{T, 
Int}}([2, 3, 5]), [3, 6], extend=true) @test x == ["[2, 3)", "[3, 6]", "[3, 6]"] @@ -40,10 +52,10 @@ const ≅ = isequal @test levels(x) == ["[2, 3)", "[3, 6]"] x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [1, 3, 6]) - @test x == ["[1, 3)", "[1, 3)", "[3, 6)"] + @test x == ["[1, 3)", "[1, 3)", "[3, 6]"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[1, 3)", "[3, 6)"] + @test levels(x) == ["[1, 3)", "[3, 6]"] x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [3, 6], extend=true) @test x == ["[1, 3)", "[1, 3)", "[3, 6]"] @@ -67,10 +79,10 @@ const ≅ = isequal breaks = [18, 25, 35, 60, 100] x = @inferred cut(Vector{Union{T, Int}}(ages), breaks) @test x == ["[18, 25)", "[18, 25)", "[25, 35)", "[25, 35)", "[18, 25)", "[18, 25)", - "[35, 60)", "[25, 35)", "[60, 100)", "[35, 60)", "[35, 60)", "[25, 35)"] + "[35, 60)", "[25, 35)", "[60, 100]", "[35, 60)", "[35, 60)", "[25, 35)"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100)"] + @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100]"] breaks = [1, 6, 3] # Unsorted breaks labels = ["b", "a"] # Differs from lexical ordering @@ -83,10 +95,10 @@ const ≅ = isequal @test levels(x) == ["b", "a"] x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5]) - @test x == ["[-2.134, 3.0)" "[3.0, 12.5)"; "[-2.134, 3.0)" "[3.0, 12.5)"] + @test x == ["[-2.13, 3)" "[3, 12.5]"; "[-2.13, 3)" "[3, 12.5]"] @test isa(x, CategoricalMatrix{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5)"] + @test levels(x) == ["[-2.13, 3)", "[3, 12.5]"] labels = 0:2:8 x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @@ -101,9 +113,6 @@ const ≅ = isequal @test isa(x, CategoricalVector{Union{Int, String, T}}) @test isordered(x) @test levels(x) == [0, "2", 4, "6", 8] - - @test_throws ArgumentError cut([-0.0, 0.0], 2) - 
@test_throws ArgumentError cut([-0.0, 0.0], 2, labels=[-0.0, 0.0]) end @testset "cut with missing values in input" begin @@ -120,22 +129,27 @@ end @testset "cut([5, 4, 3, 2], 2)" begin x = @inferred cut([5, 4, 3, 2], 2) - @test x == ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", "Q1: [2.0, 3.5)"] + @test x == ["[4, 5]", "[4, 5]", "[2, 4)", "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] end @testset "cut(x, n) with missing values" begin x = @inferred cut([5, 4, 3, missing, 2], 2) - @test x ≅ ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", missing, "Q1: [2.0, 3.5)"] + @test x ≅ ["[4, 5]", "[4, 5]", "[2, 4)", missing, "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] +end + +@testset "cut(x, n) with invalid n" begin + @test_throws ArgumentError cut(1:10, 0) + @test_throws ArgumentError cut(1:10, -1) end @testset "cut with formatter function" begin - my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + my_formatter(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $from -- $to" x = 0.15:0.20:0.95 p = [0, 0.4, 0.8, 1.0] @@ -143,20 +157,24 @@ end a = @inferred cut(x, p, labels=my_formatter) @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + my_old_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + a = @test_deprecated r"`labels`.*" cut(x, p, labels=my_old_formatter) + @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + # GH 274 - my_formatter_2(from, to, i; leftclosed, rightclosed) = "$i: $(from+1) -- $(to+1)" + my_formatter_2(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $(from+1) -- $(to+1)" a = @inferred cut(x, p, labels=my_formatter_2) @test a == ["1: 1.0 -- 1.4", "1: 1.0 -- 1.4", "2: 
1.4 -- 1.8", "2: 1.4 -- 1.8", "3: 1.8 -- 2.0"] for T in (Union{}, Missing) - labels = (from, to, i; leftclosed, rightclosed) -> (to+from)/2 + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> (to+from)/2 a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == [1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0] @test isa(a, CategoricalVector{Union{Float64, T}}) @test isordered(a) @test levels(a) == [1.0, 3.0, 5.0, 7.0, 9.0] - labels = (from, to, i; leftclosed, rightclosed) -> "$((to+from)/2)" + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> "$((to+from)/2)" a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == string.([1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0]) @test isa(a, CategoricalVector{Union{String, T}}) @@ -175,11 +193,20 @@ end x = [zeros(10); ones(10)] @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10]) @test_throws ArgumentError cut(x, 10) + y = cut(x, [0, 0.1, 10, 10]) + @test y == [fill("[0, 0.1)", 10); fill("[0.1, 10)", 10)] + @test levels(y) == ["[0, 0.1)", "[0.1, 10)", "[10, 10]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 11]) y = cut(1:10, [1, 5, 5, 11], allowempty=true) @test y == cut(1:10, [1, 5, 11]) - @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11)"] + @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"] + y = cut(1:10, [1, 5, 11, 11]) + @test y == [fill("[1, 5)", 4); fill("[5, 11)", 6)] + @test levels(y) == ["[1, 5)", "[5, 11)", "[11, 11]"] + y = cut(1:10, [1, 5, 10, 10]) + @test y == [fill("[1, 5)", 4); fill("[5, 10)", 5); "[10, 10]"] + @test levels(y) == ["[1, 5)", "[5, 10)", "[10, 10]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11]) @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], @@ -191,29 +218,29 @@ end @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], labels=string.(1:3)) y = cut(1:10, [1, 5, 5, 11], allowempty=true, labels=string.(1:3)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "3") + @test y == recode(cut(1:10, [1, 5, 11]), 
"[1, 5)" => "1", "[5, 11]" => "3") @test levels(y) == string.(1:3) @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11], labels=string.(1:4)) y = cut(1:10, [1, 5, 5, 5, 11], allowempty=true, labels=string.(1:4)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "4") + @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "4") @test levels(y) == string.(1:4) @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 5, 11], labels=string.(1:5)) y = cut(1:10, [1, 5, 5, 5, 5, 11], allowempty=true, labels=string.(1:5)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "5") + @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "5") @test levels(y) == string.(1:5) @test_throws ArgumentError cut(1:10, [1, 3, 3, 5, 5, 11], labels=string.(1:5)) y = cut(1:10, [1, 3, 3, 5, 5, 11], allowempty=true, labels=string.(1:5)) @test y == recode(cut(1:10, [1, 3, 5, 11]), - "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11)" => "5") + "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11]" => "5") @test levels(y) == string.(1:5) @test_throws ArgumentError cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], labels=string.(1:7)) y = cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], allowempty=true, labels=string.(1:7)) @test y == recode(cut(1:10, [1, 3, 5, 11]), - "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11)" => "7") + "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11]" => "7") @test levels(y) == string.(1:7) @test_throws ArgumentError cut(1:10, [1, 3, 5, 5, 11], @@ -230,8 +257,60 @@ end @test_throws ArgumentError cut(1:8, 0:2:10, labels=[0, 1, 1, 2, 3]) @test_throws ArgumentError cut(1:8, [0, 2, 2, 6, 8, 10], labels=[0, 1, 1, 2, 3], allowempty=true) - fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0) + fmt = (from, to, i; leftclosed, rightclosed, sigdigits) -> (i % 2 == 0 ? 
to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) + + @test_throws ArgumentError cut([fill(1, 10); 4], 2) + x = cut([fill(1, 10); 4], 2, allowempty=true) + @test unique(x) == ["2: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: [1, 4]"] + @test_throws ArgumentError cut([fill(1, 10); 4], 3) + x = cut([fill(1, 10); 4], 3, allowempty=true) + @test unique(x) == ["3: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: (1, 1)", "3: [1, 4]"] + + x = cut([fill(4, 10); 1], 2) + @test x == [fill("[4, 4]", 10); "[1, 4)"] + @test levels(x) == ["[1, 4)"; "[4, 4]"] + @test_throws ArgumentError cut([fill(4, 10); 1], 3) + x = cut([fill(4, 10); 1], 3, allowempty=true) + @test x == [fill("3: [4, 4]", 10); "1: [1, 4)"] + @test levels(x) == ["1: [1, 4)", "2: (4, 4)", "3: [4, 4]"] + + x = cut([fill(1, 5); fill(4, 5)], 2) + @test x == [fill("[1, 4)", 5); fill("[4, 4]", 5)] + @test levels(x) == ["[1, 4)", "[4, 4]"] + @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) + x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) + @test x == [fill("2: [1, 4)", 5); fill("3: [4, 4]", 5)] + @test levels(x) == ["1: (1, 1)", "2: [1, 4)", "3: [4, 4]"] +end + +@testset "cut with -0.0" begin + x = cut([-0.0, 0.0, 0.0, -0.0], 2) + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0]) + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0]) + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true) + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true) + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0]) + @test x == [-0.0, 0.0, 0.0, -0.0] + + @test_throws ArgumentError cut([-0.0, 0.0, 0.0, -0.0], 
[-0.0, -0.0, 0.0]) end @testset "cut with extend=true" begin @@ -255,15 +334,117 @@ end end @testset "cut with extend=missing" begin - x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0], [-0.0, 0.0, 3.0], + x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0], [-0.0, 0.0, 3.0], labels=[-0.0, 0.0], extend=missing) - @test x ≅ [-0.0, 0.0, 0.0, 0.0, missing, missing] + @test x ≅ [-0.0, 0.0, 0.0, 0.0, 0.0, missing, missing] @test x isa CategoricalArray{Union{Missing, Float64},1,UInt32} @test isordered(x) @test levels(x) == [-0.0, 0.0] x = @inferred cut(-1:0.5:1, [0, 1], extend=true) - @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"] + @test x == ["[-1, 0)", "[-1, 0)", "[0, 1]", "[0, 1]", "[0, 1]"] +end + +@testset "cut with NaN and Inf" begin + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1, 10]) + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1], extend=true) + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], 2) + @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN]) + + x = cut([1, Inf], [1], extend=true) + @test x ≅ ["[1, Inf]", "[1, Inf]"] + @test levels(x) == ["[1, Inf]"] + + x = cut([1, -Inf], [1], extend=true) + @test x ≅ ["[-Inf, 1]", "[-Inf, 1]"] + @test levels(x) == ["[-Inf, 1]"] + + x = cut([1:5; Inf], [1, 2, Inf]) + @test x ≅ ["[1, 2)"; fill("[2, Inf]", 5)] + @test levels(x) == ["[1, 2)", "[2, Inf]"] + + x = cut([1:5; -Inf], [-Inf, 2, 5]) + @test x ≅ ["[-Inf, 2)"; fill("[2, 5]", 4); "[-Inf, 2)"] + @test levels(x) == ["[-Inf, 2)", "[2, 5]"] + + x = cut([1:5; Inf], 2) + @test x ≅ [fill("[1, 4)", 3); fill("[4, Inf]", 3)] + @test levels(x) == ["[1, 4)", "[4, Inf]"] + + x = cut([1:5; -Inf], 2) + @test x ≅ [fill("[-Inf, 3)", 2); fill("[3, 5]", 3); "[-Inf, 3)"] + @test levels(x) == ["[-Inf, 3)", "[3, 5]"] end +@testset "cut when quantile falls exactly on a 
data value" begin + x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) + @test x == + ["[11, 54)", "[11, 54)", "[11, 54)", + "[54, 73)", "[54, 73)", "[54, 73)", + "[73, 84]", "[73, 84]", "[73, 84]", "[73, 84]"] + @test levels(x) == ["[11, 54)", "[54, 73)", "[73, 84]"] end + +@testset "cut computation of sigdigits" begin + x = cut([1.2, 1.3, 2], 2) + @test levels(x) == ["[1.2, 1.3)", "[1.3, 2]"] + + x = cut([1.0, 2.0, 3.0], 2) + @test levels(x) == ["[1, 2)", "[2, 3]"] + + x = cut([1.00002, 1.00003, 2], 2) + @test levels(x) == ["[1.00002, 1.00003)", "[1.00003, 2]"] + + x = cut([1.00002, 1.00003, 1.00005, 2], 2) + @test levels(x) == ["[1, 1.0001)", "[1.0001, 2]"] + + x = cut([1.00001, 1.00002, 1.00002, 2], 2) + @test levels(x) == ["[1.00001, 1.00002)", "[1.00002, 2]"] + + x = cut([1.00001, 1.00003, 1.1, 2], 2) + @test levels(x) == ["[1, 1.1)", "[1.1, 2]"] + + # @sprintf with %g uses scientific notation even in some cases + # where classic notation would be shorter + x = cut([1.0, 10.0, 100.0, 1000.0], [1.0, 10.0, 100.0, 1000.0]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1e+03]"] + # But integers are rendered using plain `string` + x = cut([1, 10, 100], [1, 10, 100, 1000]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1000]"] + + # Extreme case + x = cut([8.85718832925723e-7, 8.572446994052413e-7, 1.40217695121027e-7, 8.966449714804087e-7, + 3.070384341319470e-7, 3.070384341319471e-7, 1.8520709563325888e-7, 5.630461710066611e-7, + 6.781422109070843e-7, 4.776113711396994e-7, 0.2538909094146984, 0.5249665525921473, + 0.8321957380046366, 0.9648282851978118, 0.36084175275805797, 0.7851054639425253, + 0.6875195857202754, 0.614940093507575, 0.6224944997292978, 0.6055683461790675, + 5.349085340927365e11, 1.3471583229449602e11, 6.538893396835975e11, 4.826316844547661e11, + 8.803607035550856e11, 1.8174694671397316e10, 1.6709745443719125e11, 3.2050577954311835e11, + 1.6134999167460663e11, 7.396308745225059e11], 3) + @test levels(x) == ["[1.4e-07, 0.254)", 
"[0.254, 1.82e+10)", "[1.82e+10, 8.8e+11]"] + +end + +@testset "cut with weighted quantiles" begin + @test_throws ArgumentError cut(1:3, 3, weights=1:3) + + x = collect(Float64, 1:100) + w = fweights(repeat(1:10, inner=10)) + y = cut(x, 10, weights=w) + @test levelcode.(y) == levelcode.(cut(x, quantile(x, w, (0:10)./10))) + @test levels(y) == ["[1, 29)", "[29, 43)", "[43, 53)", "[53, 62)", "[62, 70)", + "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"] + + mx = allowmissing(x) + mx[2] = mx[10] = missing + nm_inds = .!ismissing.(mx) + y = cut(mx, 10, weights=w) + @test levelcode.(y) ≅ levelcode.(cut(mx, quantile(x[nm_inds], w[nm_inds], (0:10)./10))) + @test levels(y) == ["[1, 30)", "[30, 43)", "[43, 53)", "[53, 62)", "[62, 70)", + "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"] + + x[5] = NaN + @test_throws ArgumentError cut(x, 3, weights=w) +end + +end \ No newline at end of file diff --git a/test/17_deprecated.jl b/test/17_deprecated.jl deleted file mode 100644 index bc492484..00000000 --- a/test/17_deprecated.jl +++ /dev/null @@ -1,16 +0,0 @@ -module TestExtras -using Test -using CategoricalArrays - -const ≅ = isequal - -@testset "allow_missing argument" begin - x = categorical(["a", "b", missing]) - levels!(x, ["a"], allow_missing=true) - @test x ≅ ["a", missing, missing] - - x = cut([1, missing, 100], [1, 2], allow_missing=true) - @test x ≅ ["[1, 2)", missing, missing] -end - -end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 142bd15f..088cfc9e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,6 +10,8 @@ module TestCategoricalArrays using Test using CategoricalArrays + const ≊ = isequal + tests = [ "01_value.jl", "04_constructors.jl", @@ -25,8 +27,7 @@ module TestCategoricalArrays "13_arraycommon.jl", "14_view.jl", "15_extras.jl", - "16_recode.jl", - "17_deprecated.jl" + "16_recode.jl" ] @testset "$test" for test in tests