From 246e8f0c86fbc08a664266c9e0edffbc24727298 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 10 May 2023 13:27:46 +0200 Subject: [PATCH 01/25] Update CompatHelper.yml --- .github/workflows/CompatHelper.yml | 47 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 7344a549..8d889a9d 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,24 +1,39 @@ name: CompatHelper - on: schedule: - - cron: '00 00 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - julia-version: [1.2.0] - julia-arch: [x86] - os: [ubuntu-latest] + CompatHelper: + runs-on: ubuntu-latest steps: - - uses: julia-actions/setup-julia@latest + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 with: - version: ${{ matrix.julia-version }} - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} From d61d91129903db0171b5da642ca6d2cd7da650a8 Mon Sep 17 00:00:00 2001 From: 
Matthijs Cox <79519355+matthijscox-asml@users.noreply.github.com> Date: Fri, 27 Dec 2024 10:12:13 +0100 Subject: [PATCH 02/25] remove JET runtime dispatch error (#408) --- src/pool.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index 1018cfee..0ece21ce 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -67,8 +67,9 @@ it doesn't do this itself to avoid doing a dict lookup twice i = R(n + 1) push!(pool.levels, x) - if pool.hash !== nothing - pool.hash = hash(x, pool.hash) + pool_hash = pool.hash + if pool_hash !== nothing + pool.hash = hash(x, pool_hash) end pool.equalto = C_NULL pool.subsetof = C_NULL From bbac8dc79ae02d89d8728d0c8155afc7e8d9eedf Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 27 Dec 2024 22:57:40 +0100 Subject: [PATCH 03/25] Make `cut` close last interval on the right (#409) This is much more useful, though slightly breaking. --- src/extras.jl | 12 +++++----- test/15_extras.jl | 52 ++++++++++++++++++++++++++----------------- test/17_deprecated.jl | 2 +- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index 137875b8..f536f06f 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -11,9 +11,9 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, if ismissing(x) refs[i] = 0 - elseif extend === true && x == upper + elseif x == upper refs[i] = n-1 - elseif extend !== true && !(lower <= x < upper) + elseif extend !== true && !(lower <= x <= upper) extend === missing || throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " * "adapt them manually, or pass extend=true or extend=missing")) @@ -41,8 +41,7 @@ Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating the interval into which each entry falls. Intervals are of the form `[lower, upper)`, i.e. 
the lower bound is included and the upper bound is excluded, except -if `extend=true` the last interval, which is then closed on both ends, -i.e. `[lower, upper]`. +the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will also accept them. @@ -50,8 +49,7 @@ also accept them. # Keyword arguments * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values in `x` fall outside of the breaks; when `true`, breaks are automatically added to include - all values in `x`, and the upper bound is included in the last interval; when `missing`, - values outside of the breaks generate `missing` entries. + all values in `x`; when `missing`, values outside of the breaks generate `missing` entries. * `labels::Union{AbstractVector, Function}`: a vector of strings, characters or numbers giving the names to use for the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates @@ -200,7 +198,7 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, end levs[end] = labels(from[end], to[end], n-1, leftclosed=breaks[end-1] != breaks[end], - rightclosed=coalesce(extend, false)) + rightclosed=true) else length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) diff --git a/test/15_extras.jl b/test/15_extras.jl index 472885a1..14fb4352 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -6,27 +6,37 @@ const ≅ = isequal @testset "cut($(Union{Int, T})[...])" for T in (Union{}, Missing) x = @inferred cut(Vector{Union{Int, T}}([2, 3, 5]), [1, 3, 6]) - @test x == ["[1, 3)", "[3, 6)", "[3, 6)"] + @test x == ["[1, 3)", "[3, 6]", "[3, 6]"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[1, 3)", "[3, 6)"] + @test levels(x) == ["[1, 3)", "[3, 6]"] + + @test cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=false) == + ["[2, 5]", "[2, 
5]", "[2, 5]"] err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6]) @test err.value.msg == "value 2 (at index 1) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing" - err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5]) - @test err.value.msg == "value 5 (at index 3) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing" - if T === Missing x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing) else x = cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing) end - @test x ≅ ["[2, 5)", "[2, 5)", missing] + @test x ≅ ["[2, 5]", "[2, 5]", "[2, 5]"] @test isa(x, CategoricalVector{Union{String, Missing}}) @test isordered(x) - @test levels(x) == ["[2, 5)"] + @test levels(x) == ["[2, 5]"] + + if T === Missing + x = @inferred cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing) + else + x = cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing) + end + @test x ≅ ["[2, 5]", "[2, 5]", missing] + @test isa(x, CategoricalVector{Union{String, Missing}}) + @test isordered(x) + @test levels(x) == ["[2, 5]"] x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6], extend=true) @test x == ["[2, 3)", "[3, 6]", "[3, 6]"] @@ -40,10 +50,10 @@ const ≅ = isequal @test levels(x) == ["[2, 3)", "[3, 6]"] x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [1, 3, 6]) - @test x == ["[1, 3)", "[1, 3)", "[3, 6)"] + @test x == ["[1, 3)", "[1, 3)", "[3, 6]"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[1, 3)", "[3, 6)"] + @test levels(x) == ["[1, 3)", "[3, 6]"] x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [3, 6], extend=true) @test x == ["[1, 3)", "[1, 3)", "[3, 6]"] @@ -67,10 +77,10 @@ const ≅ = isequal breaks = [18, 25, 35, 60, 100] x = @inferred cut(Vector{Union{T, Int}}(ages), breaks) @test x == ["[18, 25)", "[18, 25)", "[25, 35)", "[25, 35)", "[18, 25)", "[18, 25)", 
- "[35, 60)", "[25, 35)", "[60, 100)", "[35, 60)", "[35, 60)", "[25, 35)"] + "[35, 60)", "[25, 35)", "[60, 100]", "[35, 60)", "[35, 60)", "[25, 35)"] @test isa(x, CategoricalVector{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100)"] + @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100]"] breaks = [1, 6, 3] # Unsorted breaks labels = ["b", "a"] # Differs from lexical ordering @@ -83,10 +93,10 @@ const ≅ = isequal @test levels(x) == ["b", "a"] x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5]) - @test x == ["[-2.134, 3.0)" "[3.0, 12.5)"; "[-2.134, 3.0)" "[3.0, 12.5)"] + @test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"] @test isa(x, CategoricalMatrix{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5)"] + @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"] labels = 0:2:8 x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @@ -179,7 +189,7 @@ end @test_throws ArgumentError cut(1:10, [1, 5, 5, 11]) y = cut(1:10, [1, 5, 5, 11], allowempty=true) @test y == cut(1:10, [1, 5, 11]) - @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11)"] + @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11]) @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], @@ -191,29 +201,29 @@ end @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], labels=string.(1:3)) y = cut(1:10, [1, 5, 5, 11], allowempty=true, labels=string.(1:3)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "3") + @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "3") @test levels(y) == string.(1:3) @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11], labels=string.(1:4)) y = cut(1:10, [1, 5, 5, 5, 11], allowempty=true, labels=string.(1:4)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "4") + @test y == recode(cut(1:10, 
[1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "4") @test levels(y) == string.(1:4) @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 5, 11], labels=string.(1:5)) y = cut(1:10, [1, 5, 5, 5, 5, 11], allowempty=true, labels=string.(1:5)) - @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "5") + @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "5") @test levels(y) == string.(1:5) @test_throws ArgumentError cut(1:10, [1, 3, 3, 5, 5, 11], labels=string.(1:5)) y = cut(1:10, [1, 3, 3, 5, 5, 11], allowempty=true, labels=string.(1:5)) @test y == recode(cut(1:10, [1, 3, 5, 11]), - "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11)" => "5") + "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11]" => "5") @test levels(y) == string.(1:5) @test_throws ArgumentError cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], labels=string.(1:7)) y = cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], allowempty=true, labels=string.(1:7)) @test y == recode(cut(1:10, [1, 3, 5, 11]), - "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11)" => "7") + "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11]" => "7") @test levels(y) == string.(1:7) @test_throws ArgumentError cut(1:10, [1, 3, 5, 5, 11], @@ -255,9 +265,9 @@ end end @testset "cut with extend=missing" begin - x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0], [-0.0, 0.0, 3.0], + x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0], [-0.0, 0.0, 3.0], labels=[-0.0, 0.0], extend=missing) - @test x ≅ [-0.0, 0.0, 0.0, 0.0, missing, missing] + @test x ≅ [-0.0, 0.0, 0.0, 0.0, 0.0, missing, missing] @test x isa CategoricalArray{Union{Missing, Float64},1,UInt32} @test isordered(x) @test levels(x) == [-0.0, 0.0] diff --git a/test/17_deprecated.jl b/test/17_deprecated.jl index bc492484..d5a08ff4 100644 --- a/test/17_deprecated.jl +++ b/test/17_deprecated.jl @@ -10,7 +10,7 @@ const ≅ = isequal @test x ≅ ["a", missing, missing] x = cut([1, missing, 100], [1, 2], allow_missing=true) - @test x ≅ ["[1, 2)", missing, missing] + @test x ≅ ["[1, 2]", missing, missing] end end \ No 
newline at end of file From 9eca0c765fdbc03944420eebeefb1d3f3e1a0e8a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 28 Dec 2024 15:15:48 +0100 Subject: [PATCH 04/25] CompatHelper: add new compat entry for Statistics at version 1, (keep existing compat) (#411) Co-authored-by: CompatHelper Julia --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 5846e340..6b388bcb 100644 --- a/Project.toml +++ b/Project.toml @@ -31,6 +31,7 @@ Missings = "0.4.3, 1" RecipesBase = "1.1" Requires = "1" SentinelArrays = "1" +Statistics = "1" StructTypes = "1" julia = "1" @@ -46,5 +47,4 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", - "RecipesBase", "SentinelArrays", "StructTypes", "Test"] +test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"] From 9f9acba2ebbb7aeb2c4f080b51c7bdc97b9ebcb9 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 28 Dec 2024 15:23:39 +0100 Subject: [PATCH 05/25] Update CI --- .github/workflows/ci.yml | 10 +++++----- docs/Project.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aaeda107..06fddf55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,12 +26,12 @@ jobs: - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v1 + - uses: actions/cache@v2 env: cache-name: cache-artifacts with: @@ -44,14 +44,14 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: 
julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: file: lcov.info docs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-docdeploy@latest env: diff --git a/docs/Project.toml b/docs/Project.toml index 1a6d3094..1814eb33 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,4 +2,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" [compat] -Documenter = "~0.27" +Documenter = "1" From 55f000a2d38f4c8c593adf0fa52f9158f379cb7d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 28 Dec 2024 15:44:02 +0100 Subject: [PATCH 06/25] Enable Dependabot --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..adee0ed1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" \ No newline at end of file From 4434fe429e6823e5049093b714bcba00f4f5794a Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 28 Dec 2024 15:52:12 +0100 Subject: [PATCH 07/25] Fix Documenter --- docs/make.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 6a5e3be8..1b260579 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,8 +15,7 @@ makedocs( "Implementation details" => "implementation.md", "API index" => "apiindex.md" ], - checkdocs = :exports, - strict=true + checkdocs = :exports ) deploydocs( From 341de709cfbd4586f8198daa382463654d538fe0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 30 Dec 2024 22:47:04 +0100 Subject: [PATCH 08/25] Fix corner cases of cut (#410) Apply more systematically the rule that all intervals are closed on the left and 
open on the right except the last one. Throw an error when duplicated breaks would lead to empty intervals unless `allowempty=true`. Improve handling of -0.0, NaN and Inf. --- src/extras.jl | 66 ++++++++++++++++++++-------------- test/15_extras.jl | 91 +++++++++++++++++++++++++++++++++++++++++++++-- test/runtests.jl | 2 ++ 3 files changed, 130 insertions(+), 29 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index f536f06f..2afcef38 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -9,11 +9,14 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, @inbounds for i in eachindex(X) x = X[i] - if ismissing(x) + if x isa Number && isnan(x) + throw(ArgumentError("NaN values are not allowed in input vector")) + elseif ismissing(x) refs[i] = 0 - elseif x == upper + elseif isequal(x, upper) refs[i] = n-1 - elseif extend !== true && !(lower <= x <= upper) + elseif extend !== true && + !((isless(lower, x) || isequal(x, lower)) && isless(x, upper)) extend === missing || throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " * "adapt them manually, or pass extend=true or extend=missing")) @@ -55,10 +58,10 @@ also accept them. the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates the labels from the left and right interval boundaries and the group index. Defaults to `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`). -* `allowempty::Bool=false`: when `false`, an error is raised if some breaks appear - multiple times, generating empty intervals; when `true`, duplicate breaks are allowed - and the intervals they generate are kept as unused levels - (but duplicate labels are not allowed). 
+* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than + the last one appear multiple times, generating empty intervals; when `true`, + duplicate breaks are allowed and the intervals they generate are kept as + unused levels (but duplicate labels are not allowed). # Examples ```jldoctest @@ -132,14 +135,19 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, extend::Union{Bool, Missing}, labels::Union{AbstractVector{<:SupportedTypes},Function}, allowempty::Bool=false) where {T, N} - if !allowempty && !allunique(breaks) - throw(ArgumentError("all breaks must be unique unless `allowempty=true`")) - end - if !issorted(breaks) breaks = sort(breaks) end + if any(x -> x isa Number && isnan(x), breaks) + throw(ArgumentError("NaN values are not allowed in breaks")) + end + + if !allowempty && !allunique(@view breaks[1:end-1]) + throw(ArgumentError("all breaks other than the last one must be unique " * + "unless `allowempty=true`")) + end + if extend === true xnm = T >: Missing ? 
skipmissing(x) : x length(breaks) >= 1 || throw(ArgumentError("at least one break must be provided")) @@ -158,11 +166,11 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, rethrow(err) end end - if !ismissing(min_x) && breaks[1] > min_x + if !ismissing(min_x) && isless(min_x, breaks[1]) # this type annotation is needed on Julia<1.7 for stable inference breaks = [min_x::nonmissingtype(eltype(x)); breaks] end - if !ismissing(max_x) && breaks[end] < max_x + if !ismissing(max_x) && isless(breaks[end], max_x) breaks = [breaks; max_x::nonmissingtype(eltype(x))] end length(breaks) > 1 || @@ -189,16 +197,15 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, from = breaks[1:n-1] to = breaks[2:n] firstlevel = labels(from[1], to[1], 1, - leftclosed=breaks[1] != breaks[2], rightclosed=false) + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) levs = Vector{typeof(firstlevel)}(undef, n-1) levs[1] = firstlevel for i in 2:n-2 levs[i] = labels(from[i], to[i], i, - leftclosed=breaks[i] != breaks[i+1], rightclosed=false) + leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false) end levs[end] = labels(from[end], to[end], n-1, - leftclosed=breaks[end-1] != breaks[end], - rightclosed=true) + leftclosed=true, rightclosed=true) else length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) @@ -243,21 +250,28 @@ quantiles. the labels from the left and right interval boundaries and the group index. Defaults to `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval). * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints - are equal, generating empty intervals; when `true`, duplicate breaks are allowed - and the intervals they generate are kept as unused levels - (but duplicate labels are not allowed). 
+ other than the last one are equal, generating empty intervals; + when `true`, duplicate breaks are allowed and the intervals they generate are kept as + unused levels (but duplicate labels are not allowed). """ function cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, allowempty::Bool=false) + ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) xnm = eltype(x) >: Missing ? skipmissing(x) : x - breaks = Statistics.quantile(xnm, (1:ngroups-1)/ngroups) - if !allowempty && !allunique(breaks) - n = length(unique(breaks)) - 1 - throw(ArgumentError("cannot compute $ngroups quantiles: `quantile` " * - "returned only $n groups due to duplicated values in `x`." * + # Computing extrema is faster than taking 0 and 1 quantiles + min_x, max_x = extrema(xnm) + if (min_x isa Number && isnan(min_x)) || + (max_x isa Number && isnan(max_x)) + throw(ArgumentError("NaN values are not allowed in input vector")) + end + breaks = quantile(xnm, (1:ngroups-1)/ngroups) + breaks = [min_x; breaks; max_x] + if !allowempty && !allunique(@view breaks[1:end-1]) + throw(ArgumentError("cannot compute $ngroups quantiles due to " * + "too many duplicated values in `x`. 
" * "Pass `allowempty=true` to allow empty quantiles or " * "choose a lower value for `ngroups`.")) end - cut(x, breaks; extend=true, labels=labels, allowempty=allowempty) + cut(x, breaks; labels=labels, allowempty=allowempty) end diff --git a/test/15_extras.jl b/test/15_extras.jl index 14fb4352..1aaf8dc7 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -111,9 +111,6 @@ const ≅ = isequal @test isa(x, CategoricalVector{Union{Int, String, T}}) @test isordered(x) @test levels(x) == [0, "2", 4, "6", 8] - - @test_throws ArgumentError cut([-0.0, 0.0], 2) - @test_throws ArgumentError cut([-0.0, 0.0], 2, labels=[-0.0, 0.0]) end @testset "cut with missing values in input" begin @@ -144,6 +141,11 @@ end @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] end +@testset "cut(x, n) with invalid n" begin + @test_throws ArgumentError cut(1:10, 0) + @test_throws ArgumentError cut(1:10, -1) +end + @testset "cut with formatter function" begin my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" @@ -185,11 +187,20 @@ end x = [zeros(10); ones(10)] @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10]) @test_throws ArgumentError cut(x, 10) + y = cut(x, [0, 0.1, 10, 10]) + @test y == [fill("[0.0, 0.1)", 10); fill("[0.1, 10.0)", 10)] + @test levels(y) == ["[0.0, 0.1)", "[0.1, 10.0)", "[10.0, 10.0]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 11]) y = cut(1:10, [1, 5, 5, 11], allowempty=true) @test y == cut(1:10, [1, 5, 11]) @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"] + y = cut(1:10, [1, 5, 11, 11]) + @test y == [fill("[1, 5)", 4); fill("[5, 11)", 6)] + @test levels(y) == ["[1, 5)", "[5, 11)", "[11, 11]"] + y = cut(1:10, [1, 5, 10, 10]) + @test y == [fill("[1, 5)", 4); fill("[5, 10)", 5); "[10, 10]"] + @test levels(y) == ["[1, 5)", "[5, 10)", "[10, 10]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11]) @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], @@ -242,6 +253,49 @@ end fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? 
to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) + + @test_throws ArgumentError cut([fill(1, 10); 4], 2) + @test_throws ArgumentError cut([fill(1, 10); 4], 3) + x = cut([fill(1, 10); 4], 2, allowempty=true) + @test unique(x) == ["Q2: [1.0, 4.0]"] + x = cut([fill(1, 10); 4], 3, allowempty=true) + @test unique(x) == ["Q3: [1.0, 4.0]"] + @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"] + + x = cut([fill(1, 5); fill(4, 5)], 2) + @test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)] + @test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"] + @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) + x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) + @test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)] + @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"] +end + +@testset "cut with -0.0" begin + x = cut([-0.0, 0.0, 0.0, -0.0], 2) + @test x == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]", "Q2: [0.0, 0.0]", "Q1: [-0.0, 0.0)"] + @test levels(x) == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0]) + @test x == ["[-0.0, 0.0)", "[0.0, 0.0]", "[0.0, 0.0]", "[-0.0, 0.0)"] + @test levels(x) == ["[-0.0, 0.0)", "[0.0, 0.0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0]) + @test x == fill("[-0.0, 0.0]", 4) + @test levels(x) == ["[-0.0, 0.0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true) + @test x == fill("[-0.0, 0.0]", 4) + @test levels(x) == ["[-0.0, 0.0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true) + @test x == fill("[-0.0, 0.0]", 4) + @test levels(x) == ["[-0.0, 0.0]"] + + x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0]) + @test x == [-0.0, 0.0, 0.0, -0.0] + + @test_throws ArgumentError cut([-0.0, 0.0, 0.0, -0.0], [-0.0, -0.0, 0.0]) end @testset "cut with extend=true" begin @@ -276,4 +330,35 @@ end @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"] end +@testset "cut with NaN and Inf" 
begin + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1, 10]) + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1], extend=true) + @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], 2) + @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN]) + + x = cut([1, Inf], [1], extend=true) + @test x ≅ ["[1.0, Inf]", "[1.0, Inf]"] + @test levels(x) == ["[1.0, Inf]"] + + x = cut([1, -Inf], [1], extend=true) + @test x ≅ ["[-Inf, 1.0]", "[-Inf, 1.0]"] + @test levels(x) == ["[-Inf, 1.0]"] + + x = cut([1:5; Inf], [1, 2, Inf]) + @test x ≅ ["[1.0, 2.0)"; fill("[2.0, Inf]", 5)] + @test levels(x) == ["[1.0, 2.0)", "[2.0, Inf]"] + + x = cut([1:5; -Inf], [-Inf, 2, 5]) + @test x ≅ ["[-Inf, 2.0)"; fill("[2.0, 5.0]", 4); "[-Inf, 2.0)"] + @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"] + + x = cut([1:5; Inf], 2) + @test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)] + @test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"] + + x = cut([1:5; -Inf], 2) + @test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"] + @test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"] end + +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 142bd15f..e59180e7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,6 +10,8 @@ module TestCategoricalArrays using Test using CategoricalArrays + const ≊ = isequal + tests = [ "01_value.jl", "04_constructors.jl", From 3e0d05653ed9b92fefecdc059f9358f8601a1af6 Mon Sep 17 00:00:00 2001 From: Tiem van der Deure Date: Fri, 3 Jan 2025 23:38:10 +0100 Subject: [PATCH 09/25] Make recode! type stable (#407) Varargs appear to be type-stable according to `@code_warntype` but in practice that's not the case. 
--- Project.toml | 2 + src/CategoricalArrays.jl | 1 + src/recode.jl | 84 +++++++++++++++++++--------------------- 3 files changed, 43 insertions(+), 44 deletions(-) diff --git a/Project.toml b/Project.toml index 6b388bcb..4593b00b 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" version = "0.10.8" [deps] +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" @@ -24,6 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" [compat] +Compat = "3.37, 4" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index 214a5d17..a28cba94 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -14,6 +14,7 @@ module CategoricalArrays using DataAPI using Missings using Printf + import Compat # JuliaLang/julia#36810 if VERSION < v"1.5.2" diff --git a/src/recode.jl b/src/recode.jl index 282d4fb6..141f9967 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -52,27 +52,34 @@ A user defined type could override this method to define an appropriate test fun optimize_pair(pair::Pair) = pair optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second -function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T} +function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::Pair...) 
if length(dest) != length(src) throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) end - opt_pairs = map(optimize_pair, pairs) + opt_pairs = optimize_pair.(pairs) + _recode!(dest, src, default, opt_pairs) +end + +function _recode!(dest::AbstractArray{T}, src::AbstractArray, default, + pairs::NTuple{<:Any, Pair}) where {T} + recode_to = last.(pairs) + recode_from = first.(pairs) + @inbounds for i in eachindex(dest, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) - dest[i] = p.second - @goto nextitem - end - end - + # @inline is needed for type stability and Compat for compatibility before julia v1.8 + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from) + + # Value in one of the pairs + if j !== nothing + dest[i] = recode_to[j] # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -89,21 +96,16 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs else dest[i] = default end - - @label nextitem end dest end -function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) 
where {T} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))")) - end - - opt_pairs = map(optimize_pair, pairs) +function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any, + pairs::NTuple{<:Any, Pair}) where {T, R} + recode_from = first.(pairs) + vals = T[p.second for p in pairs] - vals = T[p.second for p in opt_pairs] default !== nothing && push!(vals, default) levels!(dest.pool, filter!(!ismissing, unique(vals))) @@ -112,22 +114,22 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa dupvals = length(vals) != length(levels(dest.pool)) drefs = dest.refs - pairmap = [ismissing(v) ? 0 : get(dest.pool, v) for v in vals] - defaultref = default === nothing || ismissing(default) ? 0 : get(dest.pool, default) + pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] + defaultref = default === nothing || ismissing(default) ? zero(R) : get(dest.pool, default) + @inbounds for i in eachindex(drefs, src) x = src[i] - for j in 1:length(opt_pairs) - p = opt_pairs[j] - # we use isequal and recode_in because we cannot really distinguish scalars from collections - if x ≅ p.first || recode_in(x, p.first) - drefs[i] = dupvals ? pairmap[j] : j - @goto nextitem - end - end + # @inline is needed for type stability and Compat for compatibility before julia v1.8 + # we use isequal and recode_in because we cannot really + # distinguish scalars from collections + j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from) + # Value in one of the pairs + if j !== nothing + drefs[i] = dupvals ? 
pairmap[j] : j # Value not in any of the pairs - if ismissing(x) + elseif ismissing(x) eltype(dest) >: Missing || throw(MissingException("missing value found, but dest does not support them: " * "recode them to a supported value")) @@ -144,8 +146,6 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa else drefs[i] = defaultref end - - @label nextitem end # Put existing levels first, and sort them if possible @@ -168,25 +168,21 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa dest end -function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, - default::Any, pairs::Pair...) where {T, N, R<:Integer} - if length(dest) != length(src) - throw(DimensionMismatch("dest and src must be of the same length " * - "(got $(length(dest)) and $(length(src)))")) - end - +function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, + default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer} + recode_from = first.(pairs) vals = T[p.second for p in pairs] + if default === nothing srclevels = levels(src) # Remove recoded levels as they won't appear in result - firsts = (p.first for p in pairs) keptlevels = Vector{T}(undef, 0) sizehint!(keptlevels, length(srclevels)) for l in srclevels - if !(any(x -> x ≅ l, firsts) || - any(f -> recode_in(l, f), firsts)) + if !(any(x -> x ≅ l, recode_from) || + any(f -> recode_in(l, f), recode_from)) try push!(keptlevels, l) catch err From 2d16eafeb8a8117c63a1ae590bb0a2c726dbaf53 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:18:37 +0100 Subject: [PATCH 10/25] Bump actions/cache from 2 to 4 (#414) Bumps [actions/cache](https://github.com/actions/cache) from 2 to 4. 
- [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v2...v4) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06fddf55..1fb7fb41 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v2 + - uses: actions/cache@v4 env: cache-name: cache-artifacts with: From d0f708104e110b742298ef6df6cc828b17ae014e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:19:37 +0100 Subject: [PATCH 11/25] Bump julia-actions/setup-julia from 1 to 2 (#413) Bumps [julia-actions/setup-julia](https://github.com/julia-actions/setup-julia) from 1 to 2. - [Release notes](https://github.com/julia-actions/setup-julia/releases) - [Changelog](https://github.com/julia-actions/setup-julia/blob/master/devdocs/making_a_new_release.md) - [Commits](https://github.com/julia-actions/setup-julia/compare/v1...v2) --- updated-dependencies: - dependency-name: julia-actions/setup-julia dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/CompatHelper.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 8d889a9d..e628f26d 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -15,7 +15,7 @@ jobs: run: which julia continue-on-error: true - name: Install Julia, but only if it is not already available in the PATH - uses: julia-actions/setup-julia@v1 + uses: julia-actions/setup-julia@v2 with: version: '1' arch: ${{ runner.arch }} From f313cb09505840b70eae791e62912d3bb4885941 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 29 Apr 2025 11:53:30 +0200 Subject: [PATCH 12/25] Fix allocations by dropping `CategoricalPool` type parameter (#418) Self-referential types generate allocations since Julia 1.11 (JuliaLang/julia#58169). This third parameter seems to have been unnecessary since `NominalValue` and `OrdinalValue` got merged into a single `CategoricalValue` type. --- src/array.jl | 3 +-- src/pool.jl | 14 ++++++-------- src/typedefs.jl | 39 ++++++++++++++++----------------------- test/04_constructors.jl | 22 +++++----------------- 4 files changed, 28 insertions(+), 50 deletions(-) diff --git a/src/array.jl b/src/array.jl index ffbf66b8..6950d1fa 100644 --- a/src/array.jl +++ b/src/array.jl @@ -160,9 +160,8 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int}; U = leveltype(nonmissingtype(T)) S = T >: Missing ? Union{U, Missing} : U check_supported_eltype(S, T) - V = CategoricalValue{U, R} levs = levels === nothing ? 
U[] : collect(U, levels) - CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered)) + CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R}(levs, ordered)) end CategoricalArray{T, N}(::UndefInitializer, dims::NTuple{N,Int}; diff --git a/src/pool.jl b/src/pool.jl index 0ece21ce..9753a76d 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -2,20 +2,18 @@ const catpool_seed = UInt === UInt32 ? 0xe3cf1386 : 0x356f2c715023f1a5 hashlevels(levs::AbstractVector) = foldl((h, x) -> hash(x, h), levs, init=catpool_seed) -CategoricalPool{T, R, V}(ordered::Bool=false) where {T, R, V} = - CategoricalPool{T, R, V}(T[], ordered) CategoricalPool{T, R}(ordered::Bool=false) where {T, R} = CategoricalPool{T, R}(T[], ordered) CategoricalPool{T}(ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(T[], ordered) CategoricalPool{T, R}(levels::AbstractVector, ordered::Bool=false) where {T, R} = - CategoricalPool{T, R, CategoricalValue{T, R}}(convert(Vector{T}, levels), ordered) + CategoricalPool{T, R}(convert(Vector{T}, levels), ordered) CategoricalPool(levels::AbstractVector{T}, ordered::Bool=false) where {T} = CategoricalPool{T, DefaultRefType}(convert(Vector{T}, levels), ordered) CategoricalPool(invindex::Dict{T, R}, ordered::Bool=false) where {T, R <: Integer} = - CategoricalPool{T, R, CategoricalValue{T, R}}(invindex, ordered) + CategoricalPool{T, R}(invindex, ordered) Base.convert(::Type{T}, pool::T) where {T <: CategoricalPool} = pool @@ -29,12 +27,12 @@ function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) wher levelsT = convert(Vector{T}, pool.levels) invindexT = convert(Dict{T, R}, pool.invindex) - return CategoricalPool{T, R, CategoricalValue{T, R}}(levelsT, invindexT, pool.ordered) + return CategoricalPool{T, R}(levelsT, invindexT, pool.ordered) end -Base.copy(pool::CategoricalPool{T, R, V}) where {T, R, V} = - CategoricalPool{T, R, V}(copy(pool.levels), copy(pool.invindex), - pool.ordered, pool.hash) 
+Base.copy(pool::CategoricalPool{T, R}) where {T, R} = + CategoricalPool{T, R}(copy(pool.levels), copy(pool.invindex), + pool.ordered, pool.hash) function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R} @static if VERSION >= v"1.6.0" diff --git a/src/typedefs.jl b/src/typedefs.jl index 973cbaf8..0f9aa414 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -6,8 +6,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number} # Type params: # * `T` type of categorized values # * `R` integer type for referencing category levels -# * `V` categorical value type -mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} +mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} levels::Vector{T} # category levels ordered by their reference codes invindex::Dict{T, R} # map from category levels to their reference codes ordered::Bool # whether levels can be compared using < @@ -15,8 +14,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} subsetof::Ptr{Nothing} # last seen strict superset pool equalto::Ptr{Nothing} # last seen equal pool - function CategoricalPool{T, R, V}(levels::Vector{T}, - ordered::Bool) where {T, R, V} + function CategoricalPool{T, R}(levels::Vector{T}, + ordered::Bool) where {T, R} if length(levels) > typemax(R) throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end])) end @@ -24,10 +23,10 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} if length(invindex) != length(levels) throw(ArgumentError("Duplicate entries are not allowed in levels")) end - CategoricalPool{T, R, V}(levels, invindex, ordered) + CategoricalPool{T, R}(levels, invindex, ordered) end - function CategoricalPool{T, R, V}(invindex::Dict{T, R}, - ordered::Bool) where {T, R, V} + function CategoricalPool{T, R}(invindex::Dict{T, R}, + ordered::Bool) where {T, R} levels = Vector{T}(undef, length(invindex)) # If invindex contains non consecutive values, a BoundsError will be thrown try @@ -40,18 +39,12 
@@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V} if length(invindex) > typemax(R) throw(LevelsException{T, R}(levels[typemax(R)+1:end])) end - CategoricalPool{T, R, V}(levels, invindex, ordered) + CategoricalPool{T, R}(levels, invindex, ordered) end - function CategoricalPool{T, R, V}(levels::Vector{T}, - invindex::Dict{T, R}, - ordered::Bool, - hash::Union{UInt, Nothing}=nothing) where {T, R, V} - if !(V <: CategoricalValue) - throw(ArgumentError("Type $V is not a categorical value type")) - end - if V !== CategoricalValue{T, R} - throw(ArgumentError("V must be CategoricalValue{T, R}")) - end + function CategoricalPool{T, R}(levels::Vector{T}, + invindex::Dict{T, R}, + ordered::Bool, + hash::Union{UInt, Nothing}=nothing) where {T, R} pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL) return pool end @@ -77,7 +70,7 @@ the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the ordering of values of type `T`. """ struct CategoricalValue{T <: SupportedTypes, R <: Integer} - pool::CategoricalPool{T, R, CategoricalValue{T, R}} + pool::CategoricalPool{T, R} ref::R end @@ -98,14 +91,14 @@ const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2, mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractCategoricalArray{T, N, R, V, C, U} refs::Array{R, N} - pool::CategoricalPool{V, R, C} + pool::CategoricalPool{V, R} function CategoricalArray{T, N}(refs::Array{R, N}, - pool::CategoricalPool{V, R, C}) where - {T, N, R <: Integer, V, C} + pool::CategoricalPool{V, R}) where + {T, N, R <: Integer, V} T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}")) U = T >: Missing ? 
Missing : Union{} - new{T, N, R, V, C, U}(refs, pool) + new{T, N, R, V, CategoricalValue{V, R}, U}(refs, pool) end end const CategoricalVector{T, R <: Integer, V, C, U} = CategoricalArray{T, 1, R, V, C, U} diff --git a/test/04_constructors.jl b/test/04_constructors.jl index 5b39f95e..2d4eb4b0 100644 --- a/test/04_constructors.jl +++ b/test/04_constructors.jl @@ -5,22 +5,10 @@ using CategoricalArrays: DefaultRefType @testset "Type parameter constraints" begin # cannot use categorical value as level type - @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}( + @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}( Dict{CategoricalValue{Int,UInt8}, UInt8}(), false) - @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}( + @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}( CategoricalValue{Int,UInt8}[], false) - # cannot use non-categorical value as categorical value type - @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Int[], false) - @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Dict{Int, UInt8}(), false) - # level type of the pool and categorical value must match - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Int[], false) - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Dict{Int, UInt8}(), false) - # reference type of the pool and categorical value must match - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Int[], false) - @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Dict{Int, UInt8}(), false) - # correct types combination - @test CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}(Int[], false) isa CategoricalPool - @test CategoricalPool{Int, UInt8, CategoricalValue{Int, 
UInt8}}(Dict{Int, UInt8}(), false) isa CategoricalPool end @testset "empty CategoricalPool{String}" begin @@ -38,7 +26,7 @@ end @testset "empty CategoricalPool{Int}" begin pool = CategoricalPool{Int, UInt8}() - @test isa(pool, CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}) + @test isa(pool, CategoricalPool{Int, UInt8}) @test isa(pool.levels, Vector{Int}) @test length(pool.levels) == 0 @@ -50,7 +38,7 @@ end @testset "CategoricalPool{String, DefaultRefType}(a b c)" begin pool = CategoricalPool(["a", "b", "c"]) - @test isa(pool, CategoricalPool{String, UInt32, CategoricalValue{String, UInt32}}) + @test isa(pool, CategoricalPool{String, UInt32}) @test isa(pool.levels, Vector{String}) @test pool.levels == ["a", "b", "c"] @@ -156,7 +144,7 @@ end @testset "CategoricalPool{Float64, UInt8}()" begin pool = CategoricalPool{Float64, UInt8}([1.0, 2.0, 3.0]) - @test isa(pool, CategoricalPool{Float64, UInt8, CategoricalValue{Float64, UInt8}}) + @test isa(pool, CategoricalPool{Float64, UInt8}) @test CategoricalValue(pool, 1) isa CategoricalValue{Float64, UInt8} end From 8bfc64785bfbb6c6eba8aaac0506aab720b0d844 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Tue, 29 Apr 2025 03:14:28 -0700 Subject: [PATCH 13/25] fix unique() behaviour, add unique!() (#358) so it conforms to the semantics of the Base.unique() This is a breaking change that requires a new minor release. --- src/array.jl | 47 +++++++++++++++++++++++------------------ src/subarray.jl | 7 ------ test/11_array.jl | 23 ++++++++++++++++++-- test/12_missingarray.jl | 12 +++++++++++ 4 files changed, 59 insertions(+), 30 deletions(-) diff --git a/src/array.jl b/src/array.jl index 6950d1fa..04c9ea56 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,7 +1,7 @@ ## Code for CategoricalArray import Base: Array, convert, collect, copy, getindex, setindex!, similar, size, - unique, vcat, in, summary, float, complex, copyto! + unique, unique!, vcat, in, summary, float, complex, copyto! 
# Used for keyword argument default value _isordered(x::AbstractCategoricalArray) = isordered(x) @@ -867,31 +867,36 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; return A end -function _unique(::Type{S}, - refs::AbstractArray{T}, - pool::CategoricalPool) where {S, T<:Integer} - nlevels = length(levels(pool)) + 1 - order = fill(0, nlevels) # 0 indicates not seen - # If we don't track missings, short-circuit even if none has been seen - count = S >: Missing ? 0 : 1 - @inbounds for i in refs - if order[i + 1] == 0 - count += 1 - order[i + 1] = count - count == nlevels && break +# return unique refs (each value is unique) in the order of appearance in `refs` +# equivalent to fallback Base.unique() implementation, +# but short-circuits once references to all levels are encountered +function _uniquerefs(A::CatArrOrSub{T}) where T + arefs = refs(A) + res = similar(arefs, 0) + nlevels = length(levels(A)) + maxunique = nlevels + (T >: Missing ? 1 : 0) + seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref) + @inbounds for ref in arefs + if !seen[ref + 1] + push!(res, ref) + seen[ref + 1] = true + (length(res) == maxunique) && break end end - S[i == 1 ? missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0] + return res end -""" - unique(A::CategoricalArray) +unique(A::CatArrOrSub{T}) where T = + CategoricalVector{T}(_uniquerefs(A), copy(pool(A))) -Return levels which appear in `A` in their order of appearance. -This function is significantly slower than [`levels`](@ref DataAPI.levels) -since it needs to check whether levels are used or not. 
-""" -unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool) +function unique!(A::CategoricalVector) + urefs = _uniquerefs(A) + if length(urefs) != length(A) + resize!(A.refs, length(urefs)) + copyto!(A.refs, urefs) + end + return A +end """ droplevels!(A::CategoricalArray) diff --git a/src/subarray.jl b/src/subarray.jl index 3e5f3f39..d7bf72df 100644 --- a/src/subarray.jl +++ b/src/subarray.jl @@ -5,13 +5,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} = levels!(parent(sa), newlevels) -function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} - A = parent(sa) - refs = view(A.refs, sa.indices...) - S = eltype(P) >: Missing ? Union{eltype(levels(A.pool)), Missing} : eltype(levels(A.pool)) - _unique(S, refs, A.pool) -end - refs(A::SubArray{<:Any, <:Any, <:CategoricalArray}) = view(parent(A).refs, parentindices(A)...) diff --git a/test/11_array.jl b/test/11_array.jl index 1edd2fef..4ac27b1b 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -16,6 +16,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test isordered(x) === ordered @test levels(x) == sort(unique(a)) @test unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 @@ -272,6 +273,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x == collect(a) @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (4,) @test length(x) === 4 @test leveltype(x) === Float64 @@ -437,6 +439,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x[4] === CategoricalValue(x.pool, 4) @test levels(x) == unique(a) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) x[1:2] .= -1 @test x[1] === CategoricalValue(x.pool, 5) @@ -473,6 +476,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test x == a @test 
isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test unique(x) isa CategoricalVector{String, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -729,6 +733,7 @@ end @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] @test unique(x) == ["Old", "Young", "Middle"] + @test typeof(unique(x)) === typeof(x) @test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x @test levels(x) == ["Young", "Middle", "Old", "Unused"] @test unique(x) == ["Old", "Young", "Middle"] @@ -736,20 +741,34 @@ end @test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"] @test unique(x) == ["Old", "Young", "Middle"] + y = copy(x) + @test unique!(y) === y + @test y == unique(x) + x = CategoricalArray(String[]) @test isa(levels(x), Vector{String}) && isempty(levels(x)) - @test isa(unique(x), Vector{String}) && isempty(unique(x)) + @test isa(unique(x), typeof(x)) && isempty(unique(x)) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] - @test isa(unique(x), Vector{String}) && isempty(unique(x)) + @test isa(unique(x), typeof(x)) && isempty(unique(x)) + + y = copy(x) + @test unique!(y) === y + @test y == unique(x) # To test short-circuiting x = CategoricalArray(repeat(1:10, inner=10)) @test levels(x) == collect(1:10) @test unique(x) == collect(1:10) + @test unique(x) isa typeof(x) @test levels!(x, [19:-1:1; 20]) === x @test levels(x) == [19:-1:1; 20] @test unique(x) == collect(1:10) + @test unique(x) isa typeof(x) + + y = copy(x) + @test unique!(y) === y + @test y == 1:10 end end diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index fea335c2..a2204e40 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -19,9 +19,14 @@ const ≅ = isequal @test isordered(x) === ordered @test levels(x) == sort(unique(a)) @test unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 + y = copy(x) + @test y === 
unique!(y) + @test y == unique(x) + @test convert(CategoricalArray, x) === x @test convert(CategoricalArray{Union{String, Missing}}, x) === x @test convert(CategoricalArray{Union{String, Missing}, 1}, x) === x @@ -296,6 +301,7 @@ const ≅ = isequal @test x ≅ a @test levels(x) == filter(x->!ismissing(x), unique(a)) @test unique(x) ≅ unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (3,) @test length(x) === 3 @@ -440,6 +446,7 @@ const ≅ = isequal @test x == collect(a) @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test typeof(unique(x)) === typeof(x) @test size(x) === (4,) @test length(x) === 4 @test leveltype(x) === Float64 @@ -616,6 +623,7 @@ const ≅ = isequal @test x[4] === CategoricalValue(x.pool, 4) @test levels(x) == unique(a) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) x[1:2] .= -1 @test x[1] === CategoricalValue(x.pool, 5) @@ -625,6 +633,7 @@ const ≅ = isequal @test isordered(x) === false @test levels(x) == vcat(unique(a), -1) @test unique(x) == unique(collect(x)) + @test typeof(unique(x)) === typeof(x) ordered!(x, ordered) @@ -656,6 +665,7 @@ const ≅ = isequal @test x == a @test isordered(x) === ordered @test levels(x) == unique(x) == unique(a) + @test unique(x) isa CategoricalVector{Union{String, Missing}, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -816,6 +826,7 @@ const ≅ = isequal @test isordered(x) === ordered @test levels(x) == filter(x->!ismissing(x), unique(a)) @test unique(x) ≅ unique(a) + @test unique(x) isa CategoricalVector{Union{String, Missing}, R} @test size(x) === (2, 3) @test length(x) === 6 @@ -1137,6 +1148,7 @@ end x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"]) @test levels(x) == ["Middle", "Old", "Young"] @test unique(x) ≅ ["Old", "Young", "Middle", missing] + @test typeof(unique(x)) === typeof(x) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] @test unique(x) ≅ ["Old", "Young", 
"Middle", missing] From a7ccfd5ca6de9f1fa16a8b4f2fd272fbbc5f1418 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 1 May 2025 00:50:27 +0200 Subject: [PATCH 14/25] Support reading from and writing to Arrow files (#415) This requires overriding `Arrow.DictEncoding` so that an `Arrow.DictEncoded` with a `CategoricalArray` dictionary with one entry per level is created. This is the only way to ensure that indexing the Arrow column gives `CategoricalValue` objects. In practice such columns will most often be used after conversion to `CategoricalArray` via `copy`, `DataFrame`, etc. Apparently, pandas do not allow reading the resulting file if the array allows for missing values as it does not accept `missing` in the dictionary. Instead it would need missing entries to be coded via null indices, which is less efficient. Require Julia 1.6 as tests fail on older Julia versions. --- .github/workflows/ci.yml | 2 +- Project.toml | 8 +++- ext/CategoricalArraysArrowExt.jl | 70 ++++++++++++++++++++++++++++++++ src/CategoricalArrays.jl | 1 + test/13_arraycommon.jl | 53 ++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 ext/CategoricalArraysArrowExt.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1fb7fb41..c59b0c53 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: version: - - '1.0' + - '1.6' - '1' # automatically expands to the latest stable 1.x release of Julia - 'nightly' os: diff --git a/Project.toml b/Project.toml index 4593b00b..adbb8789 100644 --- a/Project.toml +++ b/Project.toml @@ -13,18 +13,21 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [weakdeps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" StructTypes = 
"856f2bd8-1eba-4b0a-8007-ebc267875bd4" [extensions] +CategoricalArraysArrowExt = "Arrow" CategoricalArraysJSONExt = "JSON" CategoricalArraysRecipesBaseExt = "RecipesBase" CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" [compat] +Arrow = "2" Compat = "3.37, 4" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" @@ -35,9 +38,10 @@ Requires = "1" SentinelArrays = "1" Statistics = "1" StructTypes = "1" -julia = "1" +julia = "1.6" [extras] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" @@ -49,4 +53,4 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"] +test = ["Arrow", "Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"] diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl new file mode 100644 index 00000000..3e764122 --- /dev/null +++ b/ext/CategoricalArraysArrowExt.jl @@ -0,0 +1,70 @@ +module CategoricalArraysArrowExt + +using CategoricalArrays +import Arrow +import Arrow: ArrowTypes + +const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray") +ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME +ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R) + +ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME +ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = + string(R) + +const REFTYPES = Dict(string(T) => T for T in (Int128, Int16, Int32, Int64, Int8, UInt128, + UInt16, UInt32, UInt64, UInt8)) +function ArrowTypes.JuliaType(::Val{CATARRAY_ARROWNAME}, + ::Type{S}, 
meta::String) where S + R = REFTYPES[meta] + return CategoricalValue{S, R} +end + +for (MV, MT) in ((:V, :T), (:(Union{V,Missing}), :(Union{T,Missing}))) + @eval begin + function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.List{U, O, B}, + isOrdered, metadata) where + {T, R, V<:CategoricalValue{T,R}, S, O, A, B, U} + newdata = Arrow.List{$MT,O,B}(data.arrow, data.validity, data.offsets, + data.data, data.ℓ, data.metadata) + levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata + catdata = CategoricalVector{$MT,R}(newdata, levels=levels) + return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata, + isOrdered, metadata) + end + + function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.Primitive{U, B}, + isOrdered, metadata) where + {T, R, V<:CategoricalValue{T,R}, S, A, B, U} + newdata = Arrow.Primitive{$MT,B}(data.arrow, data.validity, data.data, + data.ℓ, data.metadata) + levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata + catdata = CategoricalVector{$MT,R}(newdata, levels=levels) + return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata, + isOrdered, metadata) + end + end +end + +function Base.copy(x::Arrow.DictEncoded{V}) where {T, R, V<:CategoricalValue{T, R}} + pool = CategoricalPool{T,R}(x.encoding.data) + inds = x.indices + refs = similar(inds, R) + refs .= inds .+ one(R) + return CategoricalVector{T}(refs, pool) +end + +function Base.copy(x::Arrow.DictEncoded{Union{Missing,V}}) where + {T, R, V<:CategoricalValue{T, R}} + ismissing(x.encoding.data[1]) || + throw(ErrorException("`missing` must be the first value in a " * + "`CategoricalArray` pool")) + levels = collect(skipmissing(x.encoding.data)) + pool = CategoricalPool{T,R}(levels) + inds = x.indices + refs = similar(inds, R) + refs .= inds + return CategoricalVector{Union{T,Missing}}(refs, pool) +end + +end diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index a28cba94..8f511677 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -41,6 
+41,7 @@ module CategoricalArrays @static if !isdefined(Base, :get_extension) function __init__() + @require Arrow="69666777-d1a9-59fb-9406-91d4454c9d45" include("../ext/CategoricalArraysArrowExt.jl") @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl") @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl") @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl") diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 20d61ef0..4d7c5279 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -10,6 +10,8 @@ using StructTypes using RecipesBase using Plots using SentinelArrays +using Arrow +using Missings const ≅ = isequal const ≇ = !isequal @@ -2071,6 +2073,57 @@ StructTypes.StructType(::Type{<:MyCustomType}) = StructTypes.Struct() @test levels(readx.var) == levels(x.var) end +if Int == Int64 + @testset "writing and reading Arrow files" for f in (identity, passmissing(string)) + xref = f.([3, 1, 4, 1, 4]) + x = categorical(f.([3, 1, 4, 1, 4])) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt32}, Int8, + <: CategoricalVector{eltype(xref), UInt32}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{eltype(xref),1,UInt32} + + x = categorical(f.([3, 1, 4, 1, 4]), compress=true) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt8}, Int8, + <: CategoricalVector{eltype(xref), UInt8}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{eltype(xref),1,UInt8} + + x = categorical(recode(xref, 1 => missing)) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x ≅ x + @test tbl.x isa 
Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing}, + Int8, + <: CategoricalVector{Union{eltype(xref), Missing}, + UInt32}} + @test copy(tbl.x) ≅ x + @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32} + + recode!(x, missing => f(1)) + tbl = mktemp() do path, io + Arrow.write(path, (x=x,)) + Arrow.Table(path) + end + @test tbl.x == x + @test tbl.x isa Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing}, Int8, + <: CategoricalVector{Union{eltype(xref), Missing}, UInt32}} + @test copy(tbl.x) == x + @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32} + end +end + @testset "refarray, refvalue, refpool, and invrefpool" begin for y in (categorical(["b", "a", "c", "b"]), view(categorical(["a", "a", "c", "b"]), 1:3), From dc83e6af8e1e9f8063e63c7c347c247205c5e6b9 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Sat, 17 May 2025 16:49:04 +0200 Subject: [PATCH 15/25] Test on min, lts, and pre instead of 1.6 and nightly --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c59b0c53..00903e16 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,9 +12,10 @@ jobs: fail-fast: false matrix: version: - - '1.6' + - 'min' + - 'lts' - '1' # automatically expands to the latest stable 1.x release of Julia - - 'nightly' + - 'pre' os: - ubuntu-latest - macOS-latest - windows-latest arch: - x64 - x86 From adbd741fa9c43344ac2154e95d2a0813bd093cd2 Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Sat, 17 May 2025 16:49:33 +0200 Subject: [PATCH 16/25] Only test on Linux The package doesn't have any binary dependencies so there is not much value in testing on all platforms except sometimes detecting issues in test dependencies. 
--- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 00903e16..f7778286 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,8 +18,6 @@ jobs: - 'pre' os: - ubuntu-latest - - macOS-latest - - windows-latest arch: - x64 - x86 From 5d0f595289f9641c74ce3376973a55fcee5e82be Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 18 May 2025 11:22:48 +0200 Subject: [PATCH 17/25] Use RecipesPipeline instead of Plots --- Project.toml | 4 ++-- test/13_arraycommon.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index adbb8789..4de04411 100644 --- a/Project.toml +++ b/Project.toml @@ -45,12 +45,12 @@ Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +RecipesPipeline = "01d81517-befc-4cb6-b9ec-a95719d0359c" SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Arrow", "Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"] +test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StructTypes", "Test"] diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 4d7c5279..2fb369c9 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -8,7 +8,7 @@ using PooledArrays using JSON3 using StructTypes using RecipesBase -using Plots +using RecipesPipeline using SentinelArrays using Arrow using Missings From e91470442d483f1ad1a2d2b988ec07341804b16e Mon Sep 17 00:00:00 2001 From: Andreas Noack 
Date: Sun, 18 May 2025 20:05:07 +0200 Subject: [PATCH 18/25] Continue with '1.6' instead of 'min' for now because of a method ambiguity in Arrow on 1.6.0. --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f7778286..aee70898 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,10 @@ jobs: fail-fast: false matrix: version: - - 'min' + # FIXME! Switch from 1.6 to 'min' once we require a higher minimum + # We can't switch yet as there is a method ambiguity for a depndency + # in version 1.6.0. + - '1.6' - 'lts' - '1' # automatically expands to the latest stable 1.x release of Julia - 'pre' From 07b955f6c47d98d132df707acc8662c1729549db Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 19 May 2025 09:44:12 +0200 Subject: [PATCH 19/25] Remove deprecations (#419) In preparation of 1.0. These have been in place for years, it's unlikely people rely on them anyway. --- src/CategoricalArrays.jl | 2 -- src/array.jl | 8 +------- src/deprecated.jl | 18 ------------------ src/extras.jl | 12 ------------ test/05_convert.jl | 6 +++--- test/11_array.jl | 2 +- test/13_arraycommon.jl | 2 +- test/17_deprecated.jl | 16 ---------------- test/runtests.jl | 3 +-- 9 files changed, 7 insertions(+), 62 deletions(-) delete mode 100644 src/deprecated.jl delete mode 100644 test/17_deprecated.jl diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index 8f511677..e597c344 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -33,8 +33,6 @@ module CategoricalArrays include("extras.jl") include("recode.jl") - include("deprecated.jl") - if !isdefined(Base, :get_extension) using Requires: @require end diff --git a/src/array.jl b/src/array.jl index 04c9ea56..c462e7d4 100644 --- a/src/array.jl +++ b/src/array.jl @@ -790,13 +790,7 @@ entries corresponding to omitted levels will be set to `missing`.
Else, `newlevels` must include all levels which appear in the data. """ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; - allowmissing::Bool=false, - allow_missing::Union{Bool, Nothing}=nothing) where {T, N, R} - if allow_missing !== nothing - Base.depwarn("allow_missing argument is deprecated, use allowmissing instead", - :levels!) - allowmissing = allow_missing - end + allowmissing::Bool=false) where {T, N, R} (levels(A) == newlevels) && return A # nothing to do # map each new level to its ref code diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index 667b2923..00000000 --- a/src/deprecated.jl +++ /dev/null @@ -1,18 +0,0 @@ -function index(pool::CategoricalPool) - throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " * - "use levels(pool) instead")) -end -function order(pool::CategoricalPool) - throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " * - "use 1:length(levels(pool)) instead")) -end - -function categorical(A::AbstractArray, compress::Bool; kwargs...) - throw(ErrorException("categorical(A::AbstractArray, compress, kwargs...) is deprecated: " * - "use categorical(A, compress=compress, kwargs...) 
instead.")) -end - -import Base: get - -@deprecate get(x::CategoricalValue) DataAPI.unwrap(x) -@deprecate CategoricalValue(i::Integer, pool::CategoricalPool) pool[i] \ No newline at end of file diff --git a/src/extras.jl b/src/extras.jl index 2afcef38..b806a9f2 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -114,19 +114,7 @@ julia> cut(-1:0.5:1, 3, labels=fmt) @inline function cut(x::AbstractArray, breaks::AbstractVector; extend::Union{Bool, Missing}=false, labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter, - allowmissing::Union{Bool, Nothing}=nothing, - allow_missing::Union{Bool, Nothing}=nothing, allowempty::Bool=false) - if allow_missing !== nothing - Base.depwarn("allow_missing argument is deprecated, use extend=missing instead", - :cut) - extend = missing - end - if allowmissing !== nothing - Base.depwarn("allowmissing argument is deprecated, use extend=missing instead", - :cut) - extend = missing - end return _cut(x, breaks, extend, labels, allowempty) end diff --git a/test/05_convert.jl b/test/05_convert.jl index b9b93544..3e7c98be 100644 --- a/test/05_convert.jl +++ b/test/05_convert.jl @@ -55,9 +55,9 @@ using CategoricalArrays: DefaultRefType, refcode, reftype, leveltype @test convert(Union{T, U}, v3)::T == v3 end - @test unwrap(v1) === get(v1) === 1 - @test unwrap(v2) === get(v2) === 2 - @test unwrap(v3) === get(v3) === 3 + @test unwrap(v1) === 1 + @test unwrap(v2) === 2 + @test unwrap(v3) === 3 @test promote(1, v1) === (1, 1) @test promote(1.0, v1) === (1.0, 1.0) diff --git a/test/11_array.jl b/test/11_array.jl index 4ac27b1b..b474cfe1 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -719,7 +719,7 @@ using CategoricalArrays: DefaultRefType, leveltype @test levels(x) == ["c", "a", "b"] ordered!(x, ordered) - v = CategoricalValue(2, CategoricalPool(["xyz", "b"])) + v = CategoricalValue(CategoricalPool(["xyz", "b"]), 2) x[1] = v @test x[1] === CategoricalValue(x.pool, 4) @test x[2] === CategoricalValue(x.pool, 1) diff 
--git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 2fb369c9..02b51bd7 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -893,7 +893,7 @@ end @test sort(cv, rev=rev, by=byf1) ≅ sort(cv, rev=rev, by=byf1) # Check that by function is not called on unused levels/missing - byf2 = x -> (@assert get(x) != "b"; x) + byf2 = x -> (@assert x != "b"; x) replace!(cv, missing=>"a", "b"=>"a") @test sort(cv, rev=rev, by=byf2) ≅ sort(cv, rev=rev, by=byf2) end diff --git a/test/17_deprecated.jl b/test/17_deprecated.jl deleted file mode 100644 index d5a08ff4..00000000 --- a/test/17_deprecated.jl +++ /dev/null @@ -1,16 +0,0 @@ -module TestExtras -using Test -using CategoricalArrays - -const ≅ = isequal - -@testset "allow_missing argument" begin - x = categorical(["a", "b", missing]) - levels!(x, ["a"], allow_missing=true) - @test x ≅ ["a", missing, missing] - - x = cut([1, missing, 100], [1, 2], allow_missing=true) - @test x ≅ ["[1, 2]", missing, missing] -end - -end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index e59180e7..088cfc9e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,8 +27,7 @@ module TestCategoricalArrays "13_arraycommon.jl", "14_view.jl", "15_extras.jl", - "16_recode.jl", - "17_deprecated.jl" + "16_recode.jl" ] @testset "$test" for test in tests From b16588b9d392ca612b4c4b140acc5f54f7d4e479 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 21 May 2025 18:23:39 +0200 Subject: [PATCH 20/25] Choose different quantile cutpoints in `cut(x, n)` (#416) `Statistics.quantile` returns values which are not the most appropriate to generate labels. It is more intuitive to choose values from the actual data, which are likely to have fewer decimals and make more sense for users. Since intervals are closed on the left, we just have to use the value right below the quantile. This doesn't change group assignments (only labels). 
--- src/extras.jl | 55 ++++++++++++++++++++++++++++++++++------------- test/15_extras.jl | 50 ++++++++++++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/src/extras.jl b/src/extras.jl index b806a9f2..3f27aba6 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -42,8 +42,8 @@ default_formatter(from, to, i; leftclosed, rightclosed) = Cut a numeric array into intervals at values `breaks` and return an ordered `CategoricalArray` indicating -the interval into which each entry falls. Intervals are of the form `[lower, upper)`, -i.e. the lower bound is included and the upper bound is excluded, except +the interval into which each entry falls. Intervals are of the form `[lower, upper)` +(closed on the left), i.e. the lower bound is included and the upper bound is excluded, except the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will @@ -81,7 +81,7 @@ julia> cut(-1:0.5:1, 2) "Q1: [-1.0, 0.0)" "Q2: [0.0, 1.0]" "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "Q2: [0.0, 1.0]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: @@ -89,7 +89,7 @@ julia> cut(-1:0.5:1, 2, labels=["A", "B"]) "A" "B" "B" - "B" + "B" julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5]) 5-element CategoricalArray{Float64,1,UInt32}: @@ -104,11 +104,11 @@ fmt (generic function with 1 method) julia> cut(-1:0.5:1, 3, labels=fmt) 5-element CategoricalArray{String,1,UInt32}: - "grp 1 (-1.0//-0.3333333333333335)" - "grp 1 (-1.0//-0.3333333333333335)" - "grp 2 (-0.3333333333333335//0.33333333333333326)" - "grp 3 (0.33333333333333326//1.0)" - "grp 3 (0.33333333333333326//1.0)" + "grp 1 (-1.0//0.0)" + "grp 1 (-1.0//0.0)" + "grp 2 (0.0//0.5)" + "grp 3 (0.5//1.0)" + "grp 3 (0.5//1.0)" ``` """ @inline function cut(x::AbstractArray, breaks::AbstractVector; @@ -221,12 +221,38 @@ Provide the default label format for the `cut(x, ngroups)` method. 
quantile_formatter(from, to, i; leftclosed, rightclosed) = string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") +""" +Find first value in (sorted) `v` which is greater than or equal to each quantile +in (sorted) `qs`. +""" +function find_breaks(v::AbstractVector, qs::AbstractVector) + n = length(qs) + breaks = similar(v, n) + n == 0 && return breaks + + i = 1 + q = qs[1] + @inbounds for x in v + # Use isless and isequal to differentiate -0.0 from 0.0 + if isless(q, x) || isequal(q, x) + breaks[i] = x + i += 1 + i > n && break + q = qs[i] + end + end + return breaks +end + """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, allowempty::Bool=false) -Cut a numeric array into `ngroups` quantiles, determined using `quantile`. +Cut a numeric array into `ngroups` quantiles. + +This is equivalent to `cut(x, quantile(x, (0:ngroups)/ngroups))`, +but breaks are taken from actual data values instead of estimated quantiles. If `x` contains `missing` values, they are automatically skipped when computing quantiles. @@ -246,15 +272,14 @@ function cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, allowempty::Bool=false) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) - xnm = eltype(x) >: Missing ? skipmissing(x) : x - # Computing extrema is faster than taking 0 and 1 quantiles - min_x, max_x = extrema(xnm) + sorted_x = eltype(x) >: Missing ? 
sort!(collect(skipmissing(x))) : sort(x) + min_x, max_x = first(sorted_x), last(sorted_x) if (min_x isa Number && isnan(min_x)) || (max_x isa Number && isnan(max_x)) throw(ArgumentError("NaN values are not allowed in input vector")) end - breaks = quantile(xnm, (1:ngroups-1)/ngroups) - breaks = [min_x; breaks; max_x] + qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) + breaks = [min_x; find_breaks(sorted_x, qs); max_x] if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * "too many duplicated values in `x`. " * diff --git a/test/15_extras.jl b/test/15_extras.jl index 1aaf8dc7..af4f79f5 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -127,18 +127,18 @@ end @testset "cut([5, 4, 3, 2], 2)" begin x = @inferred cut([5, 4, 3, 2], 2) - @test x == ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", "Q1: [2.0, 3.5)"] + @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] end @testset "cut(x, n) with missing values" begin x = @inferred cut([5, 4, 3, missing, 2], 2) - @test x ≅ ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", missing, "Q1: [2.0, 3.5)"] + @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"] + @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] end @testset "cut(x, n) with invalid n" begin @@ -255,20 +255,29 @@ end @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) @test_throws ArgumentError cut([fill(1, 10); 4], 2) - @test_throws ArgumentError cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 2, allowempty=true) - @test unique(x) == ["Q2: [1.0, 4.0]"] + @test unique(x) == ["Q2: [1, 4]"] + @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"] + @test_throws ArgumentError 
cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 3, allowempty=true) - @test unique(x) == ["Q3: [1.0, 4.0]"] - @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"] + @test unique(x) == ["Q3: [1, 4]"] + @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] + + x = cut([fill(4, 10); 1], 2) + @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"] + @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"] + @test_throws ArgumentError cut([fill(4, 10); 1], 3) + x = cut([fill(4, 10); 1], 3, allowempty=true) + @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"] + @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) - @test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)] - @test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"] + @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)] + @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)] - @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"] + @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] + @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] end @testset "cut with -0.0" begin @@ -353,12 +362,21 @@ end @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"] x = cut([1:5; Inf], 2) - @test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)] - @test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"] + @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)] + @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"] x = cut([1:5; -Inf], 2) - @test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"] - @test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"] + @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"] + @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] +end 
+ +@testset "cut when quantile falls exactly on a data value" begin + x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) + @test x == + ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", + "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", + "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] + @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"] end end \ No newline at end of file From e4a13b149327743bd18b52423f4747ceae65f970 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 21 May 2025 18:59:20 +0200 Subject: [PATCH 21/25] Simplify default `cut` labels (#422) 1) The quantile number isn't needed in most cases in the label, and anyway it's shown when printing an ordered `CategoricalValue`. Only use it by default when `allowempty=true` to avoid data-dependent errors if there are duplicate levels. 2) Round breaks by default to a number of significant digits chosen by `sigdigits`. This number is increased if necessary for breaks to remain unique. This generates labels which are not completely correct as rounding may make the left break greater than a value which is included in the interval, but this is generally minor and expected. Taking the floor rather than rounding would be more correct, but it can generate unexpected labels due to floating point trickiness (e.g. `floor(0.0003, sigdigits=4)` gives 0.0002999). This is what R does. Add a deprecation to avoid breaking custom `labels` functions which did not accept `sigdigits`. 
--- Project.toml | 2 +- src/CategoricalArrays.jl | 4 +- src/extras.jl | 182 +++++++++++++++++++++++++++++++-------- test/15_extras.jl | 148 ++++++++++++++++++++----------- 4 files changed, 246 insertions(+), 90 deletions(-) diff --git a/Project.toml b/Project.toml index 4de04411..2c345ff7 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ CategoricalArraysStructTypesExt = "StructTypes" [compat] Arrow = "2" -Compat = "3.37, 4" +Compat = "3.47, 4.10" DataAPI = "1.6" JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21" JSON3 = "1.1.2" diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index e597c344..f3383645 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -11,10 +11,12 @@ module CategoricalArrays import DataAPI: unwrap export unwrap + using Compat + @compat public default_formatter, numbered_formatter + using DataAPI using Missings using Printf - import Compat # JuliaLang/julia#36810 if VERSION < v"1.5.2" diff --git a/src/extras.jl b/src/extras.jl index 3f27aba6..60f32a64 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -27,17 +27,67 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray, end end +if VERSION >= v"1.10" + const CUT_FMT = Printf.Format("%.*g") +end + """ - default_formatter(from, to, i; leftclosed, rightclosed) + CategoricalArrays.default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) -Provide the default label format for the `cut(x, breaks)` method. +Provide the default label format for the `cut(x, breaks)` method, +which is `"[from, to)"` if `leftclosed` is `true` and `"[from, to)"` otherwise. + +If they are floating points values, breaks are turned into to strings using +`@sprintf("%.*g", sigdigits, break)` +(or `to` using `@sprintf("%.*g", sigdigits, break)` for the last break). """ -default_formatter(from, to, i; leftclosed, rightclosed) = - string(leftclosed ? "[" : "(", from, ", ", to, rightclosed ? 
"]" : ")") +function default_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + @static if VERSION >= v"1.10" + from_str = from isa AbstractFloat ? + Printf.format(CUT_FMT, sigdigits, from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(CUT_FMT, sigdigits, to) : + string(to) + else + from_str = from isa AbstractFloat ? + Printf.format(Printf.Format("%.$(sigdigits)g"), from) : + string(from) + to_str = to isa AbstractFloat ? + Printf.format(Printf.Format("%.$(sigdigits)g"), to) : + string(to) + end + string(leftclosed ? "[" : "(", from_str, ", ", to_str, rightclosed ? "]" : ")") +end + +""" + CategoricalArrays.numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) + +Provide the default label format for the `cut(x, ngroups)` method +when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed` +is `true` and `"i: [from, to)"` otherwise. + +If they are floating points values, breaks are turned into to strings using +`@sprintf("%.*g", sigdigits, breaks)` +(or `to` using `@sprintf("%.*g", sigdigits, break)` for the last break). +""" +numbered_formatter(from, to, i::Integer; + leftclosed::Bool, rightclosed::Bool, + sigdigits::Integer) = + string(i, ": ", + default_formatter(from, to, i, leftclosed=leftclosed, rightclosed=rightclosed, + sigdigits=sigdigits)) @doc raw""" cut(x::AbstractArray, breaks::AbstractVector; labels::Union{AbstractVector,Function}, + sigdigits::Integer=3, extend::Union{Bool,Missing}=false, allowempty::Bool=false) Cut a numeric array into intervals at values `breaks` @@ -49,15 +99,25 @@ the last interval, which is closed on both ends, i.e. `[lower, upper]`. If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will also accept them. +!!! 
note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. + # Keyword arguments * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values in `x` fall outside of the breaks; when `true`, breaks are automatically added to include all values in `x`; when `missing`, values outside of the breaks generate `missing` entries. * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`). + [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval if `extend == true`). +* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels. + This value is increased automatically if necessary so that rounded breaks are unique. + Only used for floating point types and when `labels` is a function, in which case it + is passed to it as a keyword argument. 
* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than the last one appear multiple times, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as @@ -69,19 +129,19 @@ julia> using CategoricalArrays julia> cut(-1:0.5:1, [0, 1], extend=true) 5-element CategoricalArray{String,1,UInt32}: - "[-1.0, 0.0)" - "[-1.0, 0.0)" - "[0.0, 1.0]" - "[0.0, 1.0]" - "[0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2) 5-element CategoricalArray{String,1,UInt32}: - "Q1: [-1.0, 0.0)" - "Q1: [-1.0, 0.0)" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" - "Q2: [0.0, 1.0]" + "[-1, 0)" + "[-1, 0)" + "[0, 1]" + "[0, 1]" + "[0, 1]" julia> cut(-1:0.5:1, 2, labels=["A", "B"]) 5-element CategoricalArray{String,1,UInt32}: @@ -114,15 +174,17 @@ julia> cut(-1:0.5:1, 3, labels=fmt) @inline function cut(x::AbstractArray, breaks::AbstractVector; extend::Union{Bool, Missing}=false, labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter, + sigdigits::Integer=3, allowempty::Bool=false) - return _cut(x, breaks, extend, labels, allowempty) + return _cut(x, breaks, extend, labels, sigdigits, allowempty) end # Separate function for inferability (thanks to inlining of cut) function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, extend::Union{Bool, Missing}, labels::Union{AbstractVector{<:SupportedTypes},Function}, - allowempty::Bool=false) where {T, N} + sigdigits::Integer, + allowempty::Bool) where {T, N} if !issorted(breaks) breaks = sort(breaks) end @@ -179,21 +241,60 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, end end + # Find minimal number of digits so that distinct breaks remain so + if eltype(breaks) <: AbstractFloat + while true + local i + for outer i in 2:lastindex(breaks) + b1 = breaks[i-1] + b2 = breaks[i] + isequal(b1, b2) && continue + + @static if VERSION >= v"1.9" + b1_str = Printf.format(CUT_FMT, sigdigits, b1) + b2_str = 
Printf.format(CUT_FMT, sigdigits, b2) + else + b1_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b1) + b2_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b2) + end + if b1_str == b2_str + sigdigits += 1 + break + end + end + i == lastindex(breaks) && break + end + end n = length(breaks) n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true")) if labels isa Function from = breaks[1:n-1] to = breaks[2:n] - firstlevel = labels(from[1], to[1], 1, - leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) + local firstlevel + try + firstlevel = labels(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false, + sigdigits=sigdigits) + catch + # Support functions defined before v1.0, where sigdigits did not exist + Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument", + :cut) + labels_orig = labels + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> + labels_orig(from, to, i; leftclosed, rightclosed) + firstlevel = labels_orig(from[1], to[1], 1, + leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false) + end levs = Vector{typeof(firstlevel)}(undef, n-1) levs[1] = firstlevel for i in 2:n-2 levs[i] = labels(from[i], to[i], i, - leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false) + leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false, + sigdigits=sigdigits) end levs[end] = labels(from[end], to[end], n-1, - leftclosed=true, rightclosed=true) + leftclosed=true, rightclosed=true, + sigdigits=sigdigits) else length(labels) == n-1 || throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))")) @@ -213,14 +314,6 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector, CategoricalArray{S, N}(refs, pool) end -""" - quantile_formatter(from, to, i; leftclosed, rightclosed) - -Provide the default label format for the `cut(x, ngroups)` method. 
-""" -quantile_formatter(from, to, i; leftclosed, rightclosed) = - string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")") - """ Find first value in (sorted) `v` which is greater than or equal to each quantile in (sorted) `qs`. @@ -247,6 +340,7 @@ end """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, + sigdigits::Integer=3, allowempty::Bool=false) Cut a numeric array into `ngroups` quantiles. @@ -257,19 +351,32 @@ but breaks are taken from actual data values instead of estimated quantiles. If `x` contains `missing` values, they are automatically skipped when computing quantiles. +!!! note + For floating point data, breaks may be rounded to `sigdigits` significant digits + when generating interval labels, meaning that they may not reflect exactly the cutpoints + used. + # Keyword arguments * `labels::Union{AbstractVector, Function}`: a vector of strings, characters - or numbers giving the names to use for - the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates + or numbers giving the names to use for the intervals; or a function + `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates the labels from the left and right interval boundaries and the group index. Defaults to - `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval). + [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"` + for the rightmost interval if `extend == true`) if `allowempty=false`, otherwise to + [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile + number to ensure uniqueness. +* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding + breaks for inclusion in generated labels. This value is increased automatically if necessary + so that rounded breaks are unique. 
Only used for floating point types and when `labels` is a + function, in which case it is passed to it as a keyword argument. * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints other than the last one are equal, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as unused levels (but duplicate labels are not allowed). """ function cut(x::AbstractArray, ngroups::Integer; - labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter, + labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing, + sigdigits::Integer=3, allowempty::Bool=false) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) @@ -286,5 +393,8 @@ function cut(x::AbstractArray, ngroups::Integer; "Pass `allowempty=true` to allow empty quantiles or " * "choose a lower value for `ngroups`.")) end - cut(x, breaks; labels=labels, allowempty=allowempty) + if labels === nothing + labels = allowempty ? 
numbered_formatter : default_formatter + end + return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty) end diff --git a/test/15_extras.jl b/test/15_extras.jl index af4f79f5..5df7860b 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -93,10 +93,10 @@ const ≅ = isequal @test levels(x) == ["b", "a"] x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5]) - @test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"] + @test x == ["[-2.13, 3)" "[3, 12.5]"; "[-2.13, 3)" "[3, 12.5]"] @test isa(x, CategoricalMatrix{Union{String, T}}) @test isordered(x) - @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"] + @test levels(x) == ["[-2.13, 3)", "[3, 12.5]"] labels = 0:2:8 x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @@ -127,18 +127,18 @@ end @testset "cut([5, 4, 3, 2], 2)" begin x = @inferred cut([5, 4, 3, 2], 2) - @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"] + @test x == ["[4, 5]", "[4, 5]", "[2, 4)", "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] end @testset "cut(x, n) with missing values" begin x = @inferred cut([5, 4, 3, missing, 2], 2) - @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"] + @test x ≅ ["[4, 5]", "[4, 5]", "[2, 4)", missing, "[2, 4)"] @test isa(x, CategoricalArray) @test isordered(x) - @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"] + @test levels(x) == ["[2, 4)", "[4, 5]"] end @testset "cut(x, n) with invalid n" begin @@ -147,7 +147,7 @@ end end @testset "cut with formatter function" begin - my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + my_formatter(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $from -- $to" x = 0.15:0.20:0.95 p = [0, 0.4, 0.8, 1.0] @@ -155,20 +155,24 @@ end a = @inferred cut(x, p, labels=my_formatter) @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", 
"2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + my_old_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to" + a = @test_deprecated r"`labels`.*" cut(x, p, labels=my_old_formatter) + @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"] + # GH 274 - my_formatter_2(from, to, i; leftclosed, rightclosed) = "$i: $(from+1) -- $(to+1)" + my_formatter_2(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $(from+1) -- $(to+1)" a = @inferred cut(x, p, labels=my_formatter_2) @test a == ["1: 1.0 -- 1.4", "1: 1.0 -- 1.4", "2: 1.4 -- 1.8", "2: 1.4 -- 1.8", "3: 1.8 -- 2.0"] for T in (Union{}, Missing) - labels = (from, to, i; leftclosed, rightclosed) -> (to+from)/2 + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> (to+from)/2 a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == [1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0] @test isa(a, CategoricalVector{Union{Float64, T}}) @test isordered(a) @test levels(a) == [1.0, 3.0, 5.0, 7.0, 9.0] - labels = (from, to, i; leftclosed, rightclosed) -> "$((to+from)/2)" + labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> "$((to+from)/2)" a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels) @test a == string.([1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0]) @test isa(a, CategoricalVector{Union{String, T}}) @@ -188,8 +192,8 @@ end @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10]) @test_throws ArgumentError cut(x, 10) y = cut(x, [0, 0.1, 10, 10]) - @test y == [fill("[0.0, 0.1)", 10); fill("[0.1, 10.0)", 10)] - @test levels(y) == ["[0.0, 0.1)", "[0.1, 10.0)", "[10.0, 10.0]"] + @test y == [fill("[0, 0.1)", 10); fill("[0.1, 10)", 10)] + @test levels(y) == ["[0, 0.1)", "[0.1, 10)", "[10, 10]"] @test_throws ArgumentError cut(1:10, [1, 5, 5, 11]) y = cut(1:10, [1, 5, 5, 11], allowempty=true) @@ -251,55 +255,55 @@ end @test_throws ArgumentError cut(1:8, 0:2:10, labels=[0, 1, 1, 2, 3]) @test_throws ArgumentError cut(1:8, [0, 2, 2, 
6, 8, 10], labels=[0, 1, 1, 2, 3], allowempty=true) - fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0) + fmt = (from, to, i; leftclosed, rightclosed, sigdigits) -> (i % 2 == 0 ? to : 0.0) @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt) @test_throws ArgumentError cut([fill(1, 10); 4], 2) x = cut([fill(1, 10); 4], 2, allowempty=true) - @test unique(x) == ["Q2: [1, 4]"] - @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"] + @test unique(x) == ["2: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: [1, 4]"] @test_throws ArgumentError cut([fill(1, 10); 4], 3) x = cut([fill(1, 10); 4], 3, allowempty=true) - @test unique(x) == ["Q3: [1, 4]"] - @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"] + @test unique(x) == ["3: [1, 4]"] + @test levels(x) == ["1: (1, 1)", "2: (1, 1)", "3: [1, 4]"] x = cut([fill(4, 10); 1], 2) - @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"] - @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"] + @test x == [fill("[4, 4]", 10); "[1, 4)"] + @test levels(x) == ["[1, 4)"; "[4, 4]"] @test_throws ArgumentError cut([fill(4, 10); 1], 3) x = cut([fill(4, 10); 1], 3, allowempty=true) - @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"] - @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"] + @test x == [fill("3: [4, 4]", 10); "1: [1, 4)"] + @test levels(x) == ["1: [1, 4)", "2: (4, 4)", "3: [4, 4]"] x = cut([fill(1, 5); fill(4, 5)], 2) - @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)] - @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"] + @test x == [fill("[1, 4)", 5); fill("[4, 4]", 5)] + @test levels(x) == ["[1, 4)", "[4, 4]"] @test_throws ArgumentError cut([fill(1, 5); fill(4, 5)], 3) x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true) - @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)] - @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"] + @test x == [fill("2: [1, 4)", 5); fill("3: [4, 4]", 5)] + @test levels(x) == ["1: (1, 1)", "2: [1, 4)", "3: [4, 4]"] end @testset "cut with -0.0" 
begin x = cut([-0.0, 0.0, 0.0, -0.0], 2) - @test x == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]", "Q2: [0.0, 0.0]", "Q1: [-0.0, 0.0)"] - @test levels(x) == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]"] + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0]) - @test x == ["[-0.0, 0.0)", "[0.0, 0.0]", "[0.0, 0.0]", "[-0.0, 0.0)"] - @test levels(x) == ["[-0.0, 0.0)", "[0.0, 0.0]"] + @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"] + @test levels(x) == ["[-0, 0)", "[0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0]) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true) - @test x == fill("[-0.0, 0.0]", 4) - @test levels(x) == ["[-0.0, 0.0]"] + @test x == fill("[-0, 0]", 4) + @test levels(x) == ["[-0, 0]"] x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0]) @test x == [-0.0, 0.0, 0.0, -0.0] @@ -336,7 +340,7 @@ end @test levels(x) == [-0.0, 0.0] x = @inferred cut(-1:0.5:1, [0, 1], extend=true) - @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"] + @test x == ["[-1, 0)", "[-1, 0)", "[0, 1]", "[0, 1]", "[0, 1]"] end @testset "cut with NaN and Inf" begin @@ -346,37 +350,77 @@ end @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN]) x = cut([1, Inf], [1], extend=true) - @test x ≅ ["[1.0, Inf]", "[1.0, Inf]"] - @test levels(x) == ["[1.0, Inf]"] + @test x ≅ ["[1, Inf]", "[1, Inf]"] + @test levels(x) == ["[1, Inf]"] x = cut([1, -Inf], [1], extend=true) - @test x ≅ ["[-Inf, 1.0]", "[-Inf, 1.0]"] - @test levels(x) == ["[-Inf, 1.0]"] + @test x ≅ ["[-Inf, 1]", "[-Inf, 1]"] + @test levels(x) == ["[-Inf, 1]"] 
x = cut([1:5; Inf], [1, 2, Inf]) - @test x ≅ ["[1.0, 2.0)"; fill("[2.0, Inf]", 5)] - @test levels(x) == ["[1.0, 2.0)", "[2.0, Inf]"] + @test x ≅ ["[1, 2)"; fill("[2, Inf]", 5)] + @test levels(x) == ["[1, 2)", "[2, Inf]"] x = cut([1:5; -Inf], [-Inf, 2, 5]) - @test x ≅ ["[-Inf, 2.0)"; fill("[2.0, 5.0]", 4); "[-Inf, 2.0)"] - @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"] + @test x ≅ ["[-Inf, 2)"; fill("[2, 5]", 4); "[-Inf, 2)"] + @test levels(x) == ["[-Inf, 2)", "[2, 5]"] x = cut([1:5; Inf], 2) - @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)] - @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"] + @test x ≅ [fill("[1, 4)", 3); fill("[4, Inf]", 3)] + @test levels(x) == ["[1, 4)", "[4, Inf]"] x = cut([1:5; -Inf], 2) - @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"] - @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"] + @test x ≅ [fill("[-Inf, 3)", 2); fill("[3, 5]", 3); "[-Inf, 3)"] + @test levels(x) == ["[-Inf, 3)", "[3, 5]"] end @testset "cut when quantile falls exactly on a data value" begin x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3) @test x == - ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)", - "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)", - "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"] - @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"] + ["[11, 54)", "[11, 54)", "[11, 54)", + "[54, 73)", "[54, 73)", "[54, 73)", + "[73, 84]", "[73, 84]", "[73, 84]", "[73, 84]"] + @test levels(x) == ["[11, 54)", "[54, 73)", "[73, 84]"] +end + +@testset "cut computation of sigdigits" begin + x = cut([1.2, 1.3, 2], 2) + @test levels(x) == ["[1.2, 1.3)", "[1.3, 2]"] + + x = cut([1.0, 2.0, 3.0], 2) + @test levels(x) == ["[1, 2)", "[2, 3]"] + + x = cut([1.00002, 1.00003, 2], 2) + @test levels(x) == ["[1.00002, 1.00003)", "[1.00003, 2]"] + + x = cut([1.00002, 1.00003, 1.00005, 2], 2) + @test levels(x) == ["[1, 1.0001)", "[1.0001, 2]"] + + x = cut([1.00001, 1.00002, 1.00002, 
2], 2) + @test levels(x) == ["[1.00001, 1.00002)", "[1.00002, 2]"] + + x = cut([1.00001, 1.00003, 1.1, 2], 2) + @test levels(x) == ["[1, 1.1)", "[1.1, 2]"] + + # @sprintf with %g uses scientific notation even in some cases + # where classic notation would be shorter + x = cut([1.0, 10.0, 100.0, 1000.0], [1.0, 10.0, 100.0, 1000.0]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1e+03]"] + # But integers are rendered using plain `string` + x = cut([1, 10, 100], [1, 10, 100, 1000]) + @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1000]"] + + # Extreme case + x = cut([8.85718832925723e-7, 8.572446994052413e-7, 1.40217695121027e-7, 8.966449714804087e-7, + 3.070384341319470e-7, 3.070384341319471e-7, 1.8520709563325888e-7, 5.630461710066611e-7, + 6.781422109070843e-7, 4.776113711396994e-7, 0.2538909094146984, 0.5249665525921473, + 0.8321957380046366, 0.9648282851978118, 0.36084175275805797, 0.7851054639425253, + 0.6875195857202754, 0.614940093507575, 0.6224944997292978, 0.6055683461790675, + 5.349085340927365e11, 1.3471583229449602e11, 6.538893396835975e11, 4.826316844547661e11, + 8.803607035550856e11, 1.8174694671397316e10, 1.6709745443719125e11, 3.2050577954311835e11, + 1.6134999167460663e11, 7.396308745225059e11], 3) + @test levels(x) == ["[1.4e-07, 0.254)", "[0.254, 1.82e+10)", "[1.82e+10, 8.8e+11]"] + end end \ No newline at end of file From 11d43c1bc1c2b67046867fd9bca318443fa46a25 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 21 May 2025 19:35:31 +0200 Subject: [PATCH 22/25] Support weighted quantiles in `cut` (#423) This requires adding an extension point for StatsBase. Unfortunately more copies of the data and weights are done than necessary as StatsBase does not support in-place weighted quantile! on pre-sorted data nor taking a view of weights vectors (JuliaStats/StatsBase.jl#723). 
--- Project.toml | 6 +++- ext/CategoricalArraysStatsBaseExt.jl | 13 ++++++++ src/CategoricalArrays.jl | 1 + src/extras.jl | 44 +++++++++++++++++++++++----- test/15_extras.jl | 24 +++++++++++++++ 5 files changed, 79 insertions(+), 9 deletions(-) create mode 100644 ext/CategoricalArraysStatsBaseExt.jl diff --git a/Project.toml b/Project.toml index 2c345ff7..a9262e93 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" @@ -23,6 +24,7 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" CategoricalArraysArrowExt = "Arrow" CategoricalArraysJSONExt = "JSON" CategoricalArraysRecipesBaseExt = "RecipesBase" +CategoricalArraysStatsBaseExt = "StatsBase" CategoricalArraysSentinelArraysExt = "SentinelArrays" CategoricalArraysStructTypesExt = "StructTypes" @@ -37,6 +39,7 @@ RecipesBase = "1.1" Requires = "1" SentinelArrays = "1" Statistics = "1" +StatsBase = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34" StructTypes = "1" julia = "1.6" @@ -49,8 +52,9 @@ PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" RecipesPipeline = "01d81517-befc-4cb6-b9ec-a95719d0359c" SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StructTypes", "Test"] +test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", 
"RecipesPipeline", "SentinelArrays", "StatsBase", "StructTypes", "Test"] diff --git a/ext/CategoricalArraysStatsBaseExt.jl b/ext/CategoricalArraysStatsBaseExt.jl new file mode 100644 index 00000000..8cbd5c61 --- /dev/null +++ b/ext/CategoricalArraysStatsBaseExt.jl @@ -0,0 +1,13 @@ +module CategoricalArraysStatsBaseExt + +if isdefined(Base, :get_extension) + import CategoricalArrays: _wquantile + using StatsBase +else + import ..CategoricalArrays: _wquantile + using ..StatsBase +end + +_wquantile(x::AbstractArray, w::AbstractWeights, p::AbstractVector) = quantile(x, w, p) + +end diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl index f3383645..f44b3c2f 100644 --- a/src/CategoricalArrays.jl +++ b/src/CategoricalArrays.jl @@ -45,6 +45,7 @@ module CategoricalArrays @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl") @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl") @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl") + @require StatsBase="2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" include("../ext/CategoricalArraysStatsBaseExt.jl") @require StructTypes="856f2bd8-1eba-4b0a-8007-ebc267875bd4" include("../ext/CategoricalArraysStructTypesExt.jl") end end diff --git a/src/extras.jl b/src/extras.jl index 60f32a64..910c6e46 100644 --- a/src/extras.jl +++ b/src/extras.jl @@ -337,11 +337,17 @@ function find_breaks(v::AbstractVector, qs::AbstractVector) return breaks end +# AbstractWeights method is defined in StatsBase extension +# There is no in-place weighted quantile method in StatsBase +_wquantile(x::AbstractArray, w::AbstractVector, p::AbstractVector) = + throw(ArgumentError("`weights` must be an `AbstractWeights` vector from StatsBase.jl")) + """ cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:AbstractString},Function}, sigdigits::Integer=3, - allowempty::Bool=false) 
+ allowempty::Bool=false, + weights::Union{AbstractWeights, Nothing}=nothing) Cut a numeric array into `ngroups` quantiles. @@ -373,19 +379,41 @@ quantiles. other than the last one are equal, generating empty intervals; when `true`, duplicate breaks are allowed and the intervals they generate are kept as unused levels (but duplicate labels are not allowed). +* `weights::Union{AbstractWeights, Nothing}=nothing`: observation weights to use when + computing quantiles (see `quantile` documentation in StatsBase). """ function cut(x::AbstractArray, ngroups::Integer; labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing, sigdigits::Integer=3, - allowempty::Bool=false) + allowempty::Bool=false, + weights::Union{AbstractVector, Nothing}=nothing) ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)")) - sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x) - min_x, max_x = first(sorted_x), last(sorted_x) - if (min_x isa Number && isnan(min_x)) || - (max_x isa Number && isnan(max_x)) - throw(ArgumentError("NaN values are not allowed in input vector")) + if weights === nothing + sorted_x = eltype(x) >: Missing ?
sort!(collect(skipmissing(x))) : sort(x) + min_x, max_x = first(sorted_x), last(sorted_x) + if (min_x isa Number && isnan(min_x)) || + (max_x isa Number && isnan(max_x)) + throw(ArgumentError("NaN values are not allowed in input vector")) + end + qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) + else + if eltype(x) >: Missing + nm_inds = findall(!ismissing, x) + nm_x = view(x, nm_inds) + # TODO: use a view once this is supported (JuliaStats/StatsBase.jl#723) + nm_weights = weights[nm_inds] + else + nm_x = x + nm_weights = weights + end + sorted_x = sort(nm_x) + min_x, max_x = first(sorted_x), last(sorted_x) + if (min_x isa Number && isnan(min_x)) || + (max_x isa Number && isnan(max_x)) + throw(ArgumentError("NaN values are not allowed in input vector")) + end + qs = _wquantile(nm_x, nm_weights, (1:(ngroups-1))/ngroups) end - qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true) breaks = [min_x; find_breaks(sorted_x, qs); max_x] if !allowempty && !allunique(@view breaks[1:end-1]) throw(ArgumentError("cannot compute $ngroups quantiles due to " * diff --git a/test/15_extras.jl b/test/15_extras.jl index 5df7860b..80dc14b7 100644 --- a/test/15_extras.jl +++ b/test/15_extras.jl @@ -1,6 +1,8 @@ module TestExtras using Test using CategoricalArrays +using StatsBase +using Missings const ≅ = isequal @@ -423,4 +425,26 @@ end end +@testset "cut with weighted quantiles" begin + @test_throws ArgumentError cut(1:3, 3, weights=1:3) + + x = collect(Float64, 1:100) + w = fweights(repeat(1:10, inner=10)) + y = cut(x, 10, weights=w) + @test levelcode.(y) == levelcode.(cut(x, quantile(x, w, (0:10)./10))) + @test levels(y) == ["[1, 29)", "[29, 43)", "[43, 53)", "[53, 62)", "[62, 70)", + "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"] + + mx = allowmissing(x) + mx[2] = mx[10] = missing + nm_inds = .!ismissing.(mx) + y = cut(mx, 10, weights=w) + @test levelcode.(y) ≅ levelcode.(cut(mx, quantile(x[nm_inds], w[nm_inds], (0:10)./10))) + @test levels(y) == 
["[1, 30)", "[30, 43)", "[43, 53)", "[53, 62)", "[62, 70)", + "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"] + + x[5] = NaN + @test_throws ArgumentError cut(x, 3, weights=w) +end + end \ No newline at end of file From 13a9bad33f1500b950f355e837ba546d428c0e38 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 23 May 2025 23:32:51 +0200 Subject: [PATCH 23/25] Add `Array` constructors and `convert` methods (#420) Consistent with existing `similar` methods and the `Array` constructor, ensure `T(::CategoricalArray{U})` and `convert(T, ::CategoricalArray{U})` return an `Array{U}` for `T` in `Array`, `Vector`, `Matrix`. Same for `SubArray`s of `CategoricalArray`s. This avoids creating `Array{<:CategoricalValue}` objects which are inefficient and unlikely to be what users want. --- src/array.jl | 17 +++++-- test/13_arraycommon.jl | 100 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/src/array.jl b/src/array.jl index c462e7d4..8101fb56 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,6 +1,7 @@ ## Code for CategoricalArray -import Base: Array, convert, collect, copy, getindex, setindex!, similar, size, +import Base: Array, Vector, Matrix, convert, collect, copy, getindex, + setindex!, similar, size, unique, unique!, vcat, in, summary, float, complex, copyto! 
# Used for keyword argument default value @@ -410,6 +411,12 @@ convert(::Type{CategoricalArray{T, N}}, A::CategoricalArray{T, N}) where {T, N} convert(::Type{CategoricalArray{T}}, A::CategoricalArray{T}) where {T} = A convert(::Type{CategoricalArray}, A::CategoricalArray) = A +convert(::Type{Array{S, N}}, A::CatArrOrSub{T, N}) where {S, T, N} = + collect(S, A) +convert(::Type{Array}, A::CatArrOrSub) = unwrap.(A) +convert(::Type{Vector}, A::CatArrOrSub) = unwrap.(A) +convert(::Type{Matrix}, A::CatArrOrSub) = unwrap.(A) + function Base.:(==)(A::CategoricalArray{S}, B::CategoricalArray{T}) where {S, T} if size(A) != size(B) return false @@ -1048,8 +1055,10 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R} end end -Array(A::CategoricalArray{T}) where {T} = Array{T}(A) -collect(A::CategoricalArray) = copy(A) +Array(A::CatArrOrSub{T}) where {T} = Array{T}(A) +Vector(A::CatArrOrSub{T}) where {T} = Vector{T}(A) +Matrix(A::CatArrOrSub{T}) where {T} = Matrix{T}(A) +collect(A::CatArrOrSub) = copy(A) # Defined for performance collect(x::Base.SkipMissing{<: CatArrOrSub{T}}) where {T} = @@ -1119,7 +1128,7 @@ function Base.sort!(v::CategoricalVector; levs = eltype(v) >: Missing ? eltype(v)[i == 0 ? 
missing : CategoricalValue(v.pool, i) for i in 0:length(v.pool)] : eltype(v)[CategoricalValue(v.pool, i) for i in 1:length(v.pool)] - sortedlevs = sort!(Vector(view(levs, seen)), order=ord) + sortedlevs = sort!(Vector{eltype(levs)}(view(levs, seen)), order=ord) levelsmap = something.(indexin(sortedlevs, levs)) j = 0 refs = v.refs diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 02b51bd7..2cf3ff6b 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -1330,18 +1330,116 @@ end @test levels(x) == [2, 1, 3, 4] end -@testset "Array(::CategoricalArray{T}) produces Array{T}" begin +@testset "Array(::CatArrOrSub{T}) produces Array{T}" begin x = [1,1,2,2] y = categorical(x) z = Array(y) @test typeof(x) == typeof(z) @test z == x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x x = [1,1,2,missing] y = categorical(x) z = Array(y) @test typeof(x) == typeof(z) @test z ≅ x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1,1,2,2] + y = categorical(x) + z = Vector(y) + @test typeof(x) == typeof(z) + @test z == x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = categorical(x) + z = Vector(y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1 1 2 2] + y = categorical(x) + z = Matrix(y) + @test typeof(x) == typeof(z) + @test z == x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1 1 2 missing] + y = categorical(x) + z = Matrix(y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x +end + +@testset "convert(Array, ::CatArrOrSub{T}) produces Array{T}" begin + x = [1,1,2,2] + y = categorical(x) + z = convert(Array, y) + @test typeof(x) == typeof(z) + @test z == x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = 
categorical(x) + z = convert(Array, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Array(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1,1,2,2] + y = categorical(x) + z = convert(Vector, y) + @test typeof(x) == typeof(z) + @test z == x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1,1,2,missing] + y = categorical(x) + z = convert(Vector, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Vector(view(x, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x + + x = [1 1 2 2] + y = categorical(x) + z = convert(Matrix, y) + @test typeof(x) == typeof(z) + @test z == x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z == x + + x = [1 1 2 missing] + y = categorical(x) + z = convert(Matrix, y) + @test typeof(x) == typeof(z) + @test z ≅ x + z = Matrix(view(x, :, 1:4)) + @test typeof(x) == typeof(z) + @test z ≅ x end @testset "Array{T} constructors and convert" begin From 7badc9ec6c2f3fe29fb2abd9a02996f2b240c566 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 31 Jul 2025 11:37:03 +0200 Subject: [PATCH 24/25] Make `levels` return a `CategoricalArray` (#425) Having `levels` preserve the eltype of the input is sometimes useful to write generic code. This is only slightly breaking as the result still compares equal to the previous behavior returning unwrapped values. 
--- benchmark/benchmarks.jl | 4 +- docs/src/using.md | 18 +++---- ext/CategoricalArraysArrowExt.jl | 2 + ext/CategoricalArraysRecipesBaseExt.jl | 2 +- src/array.jl | 56 +++++++++++---------- src/pool.jl | 29 ++++++----- src/recode.jl | 10 ++-- src/typedefs.jl | 5 +- src/value.jl | 8 +-- test/01_value.jl | 7 ++- test/07_levels.jl | 68 ++++++++++++++++++-------- test/11_array.jl | 2 +- test/12_missingarray.jl | 2 +- test/13_arraycommon.jl | 8 +-- test/14_view.jl | 3 +- 15 files changed, 135 insertions(+), 89 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5c2ae42b..bf12f7c9 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -55,11 +55,11 @@ SUITE["many levels"]["CategoricalArray(::Vector{String})"] = a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000) ca = CategoricalArray(a) -levs = levels(ca) +levs = unwrap.(levels(ca)) SUITE["many levels"]["levels! with original levels"] = @benchmarkable levels!(ca, levs) -levs = reverse(levels(ca)) +levs = reverse(unwrap.(levels(ca))) SUITE["many levels"]["levels! 
with resorted levels"] = @benchmarkable levels!(ca, levs) diff --git a/docs/src/using.md b/docs/src/using.md index 9790e8cf..24c452b0 100644 --- a/docs/src/using.md +++ b/docs/src/using.md @@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function: ```jldoctest using julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -81,7 +81,7 @@ julia> droplevels!(x) "Young" julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Young" "Middle" @@ -139,7 +139,7 @@ Levels still need to be reordered manually: ```jldoctest using julia> levels(y) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Middle" "Old" "Young" @@ -251,7 +251,7 @@ julia> xy = vcat(x, y) "Middle" julia> levels(xy) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -263,7 +263,7 @@ true Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*: ```jldoctest using julia> levels(x) -2-element Vector{String}: +2-element CategoricalArray{String,1,UInt32}: "Middle" "Old" @@ -271,7 +271,7 @@ julia> x[1] = y[1] CategoricalValue{String, UInt32} "Young" (1/2) julia> levels(x) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "Young" "Middle" "Old" @@ -296,7 +296,7 @@ julia> ab = vcat(a, b) "c" julia> levels(ab) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" "c" @@ -320,7 +320,7 @@ julia> ab2 = vcat(a, b) "c" julia> levels(ab2) -3-element Vector{String}: +3-element CategoricalArray{String,1,UInt32}: "a" "b" 
"c" diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl index 3e764122..811870d2 100644 --- a/ext/CategoricalArraysArrowExt.jl +++ b/ext/CategoricalArraysArrowExt.jl @@ -7,6 +7,8 @@ import Arrow: ArrowTypes const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray") ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R) +ArrowTypes.ArrowType(::Type{<:CategoricalValue{T}}) where {T} = T +ArrowTypes.toarrow(x::CategoricalValue) = unwrap(x) ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} = diff --git a/ext/CategoricalArraysRecipesBaseExt.jl b/ext/CategoricalArraysRecipesBaseExt.jl index 2642f838..656f3e3d 100644 --- a/ext/CategoricalArraysRecipesBaseExt.jl +++ b/ext/CategoricalArraysRecipesBaseExt.jl @@ -9,7 +9,7 @@ else end RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue - level_strings = [map(string, levels(v)); missing] + level_strings = [map(string, CategoricalArrays._levels(v)); missing] ticks --> eachindex(level_strings) v -> ismissing(v) ? 
length(level_strings) : Int(CategoricalArrays.refcode(v)), i -> level_strings[Int(i)] diff --git a/src/array.jl b/src/array.jl index 8101fb56..4d47e82c 100644 --- a/src/array.jl +++ b/src/array.jl @@ -240,7 +240,7 @@ function CategoricalArray{T, N, R}(A::CategoricalArray{S, N, Q}; catch err err isa LevelsException || rethrow(err) throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) end end return res @@ -359,18 +359,18 @@ function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N}; copyto!(res, A) if levels !== nothing - CategoricalArrays.levels(res) == levels || + _levels(res) == levels || throw(ArgumentError("encountered value(s) not in specified `levels`: " * - "$(setdiff(CategoricalArrays.levels(res), levels))")) + "$(setdiff(_levels(res), levels))")) else # if order is defined for level type, automatically apply it L = leveltype(res) if Base.OrderStyle(L) isa Base.Ordered - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) elseif hasmethod(isless, (L, L)) # isless may throw an error, e.g. 
for AbstractArray{T} of unordered T try - levels!(res, sort(CategoricalArrays.levels(res))) + levels!(res, sort(_levels(res))) catch e e isa MethodError || rethrow(e) end @@ -383,7 +383,7 @@ end # From CategoricalArray (preserve levels, ordering and R) function convert(::Type{CategoricalArray{T, N, R}}, A::CategoricalArray{S, N}) where {S, T, N, R} if length(A.pool) > typemax(R) - throw(LevelsException{T, R}(levels(A)[typemax(R)+1:end])) + throw(LevelsException{T, R}(_levels(A)[typemax(R)+1:end])) end if !(T >: Missing) && S >: Missing && any(iszero, A.refs) @@ -467,7 +467,7 @@ size(A::CategoricalArray) = size(A.refs) Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear() function update_refs!(A::CategoricalArray, newlevels::AbstractVector) - oldlevels = levels(A) + oldlevels = _levels(A) levelsmap = similar(A.refs, length(oldlevels)+1) # 0 maps to a missing value levelsmap[1] = 0 @@ -485,7 +485,7 @@ function merge_pools!(A::CatArrOrSub, updaterefs::Bool=true, updatepool::Bool=true) newlevels, ordered = merge_pools(pool(A), pool(B)) - oldlevels = levels(A) + oldlevels = _levels(A) pA = A isa SubArray ? 
parent(A) : A ordered!(pA, ordered) # If A's levels are an ordered superset of new (merged) pool, no need to recompute refs @@ -544,8 +544,8 @@ function copyto!(dest::CatArrOrSub{T, N, R}, dstart::Integer, # try converting src to dest type to avoid partial copy corruption of dest # in the event that the src cannot be copied into dest - slevs = convert(Vector{T}, levels(src)) - dlevs = levels(dest) + slevs = convert(Vector{T}, _levels(src)) + dlevs = _levels(dest) if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs) throw(MissingException("cannot copy array with missing values to an array with element type $T")) end @@ -598,7 +598,7 @@ function copyto!(dest::CatArrOrSub{T1, N, R}, dstart::Integer, return invoke(copyto!, Tuple{AbstractArray, Integer, AbstractArray, Integer, Integer}, dest, dstart, src, sstart, n) end - newdestlevs = destlevs = copy(levels(dest)) # copy since we need original levels below + newdestlevs = destlevs = copy(_levels(dest)) # copy since we need original levels below srclevsnm = T2 >: Missing ? setdiff(srclevs, [missing]) : srclevs if !(srclevsnm ⊆ destlevs) # if order is defined for level type, automatically apply it @@ -708,7 +708,7 @@ While this will reduce memory use, this function is type-unstable, which can aff performance inside the function where the call is made. Therefore, use it with caution. """ function compress(A::CategoricalArray{T, N}) where {T, N} - R = reftype(length(levels(A.pool))) + R = reftype(length(_levels(A.pool))) convert(CategoricalArray{T, N, R}, A) end @@ -726,11 +726,11 @@ decompress(A::CategoricalArray{T, N}) where {T, N} = convert(CategoricalArray{T, N, DefaultRefType}, A) function vcat(A::CategoricalArray...) - ordered = any(isordered, A) && all(a->isordered(a) || isempty(levels(a)), A) - newlevels, ordered = mergelevels(ordered, map(levels, A)...) 
+ ordered = any(isordered, A) && all(a->isordered(a) || isempty(_levels(a)), A) + newlevels, ordered = mergelevels(ordered, map(_levels, A)...) refsvec = map(A) do a - ii = convert(Vector{Int}, indexin(levels(a.pool), newlevels)) + ii = convert(Vector{Int}, indexin(_levels(a.pool), newlevels)) [x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)} end @@ -768,23 +768,25 @@ This may include levels which do not actually appear in the data `missing` will be included only if it appears in the data and `skipmissing=false` is passed. -The returned vector is an internal field of `x` which must not be mutated +The returned vector is owned by `x` and must not be mutated as doing so would corrupt it. """ -@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T +@inline function DataAPI.levels(A::CatArrOrSub; skipmissing::Bool=true) if eltype(A) >: Missing && !skipmissing if any(==(0), refs(A)) - T[levels(pool(A)); missing] + eltype(A)[levels(pool(A)); missing] else - convert(Vector{T}, levels(pool(A))) + levels_missing(pool(A)) end else levels(pool(A)) end end +_levels(A::CatArrOrSub) = _levels(pool(A)) + """ - levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false) + levels!(A::CategoricalArray, newlevels::AbstractVector; allowmissing::Bool=false) Set the levels categorical array `A`. The order of appearance of levels will be respected by [`levels`](@ref DataAPI.levels), which may affect display of results in some operations; if `A` is @@ -798,7 +800,7 @@ Else, `newlevels` must include all levels which appear in the data. 
""" function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; allowmissing::Bool=false) where {T, N, R} - (levels(A) == newlevels) && return A # nothing to do + (_levels(A) == newlevels) && return A # nothing to do # map each new level to its ref code newlv2ref = Dict{eltype(newlevels), Int}() @@ -813,7 +815,7 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector; end # map each old ref code to new ref code (or 0 if no such level) - oldlevels = levels(pool(A)) + oldlevels = _levels(pool(A)) oldref2newref = fill(0, length(oldlevels) + 1) for (i, lv) in enumerate(oldlevels) oldref2newref[i + 1] = get(newlv2ref, lv, 0) @@ -874,7 +876,7 @@ end function _uniquerefs(A::CatArrOrSub{T}) where T arefs = refs(A) res = similar(arefs, 0) - nlevels = length(levels(A)) + nlevels = length(_levels(A)) maxunique = nlevels + (T >: Missing ? 1 : 0) seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref) @inbounds for ref in arefs @@ -907,7 +909,7 @@ returned by [`levels`](@ref DataAPI.levels)). """ function droplevels!(A::CategoricalArray) arefs = refs(A) - nlevels = length(levels(A)) + 1 # +1 for missing + nlevels = length(_levels(A)) + 1 # +1 for missing seen = fill(false, nlevels) seen[1] = true # assume that missing is always observed to simplify checks nseen = 1 @@ -920,7 +922,7 @@ function droplevels!(A::CategoricalArray) end # replace the pool - A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A)) + A.pool = typeof(pool(A))(@inbounds(_levels(A)[view(seen, 2:nlevels)]), isordered(A)) # recode refs to keep only the seen ones (optimized version of update_refs!()) seen[1] = false # to start levelsmap from 0 levelsmap = cumsum(seen) @@ -1037,7 +1039,7 @@ end ordered=_isordered(A), compress::Bool=false) where {T, N, R} # @inline is needed so that return type is inferred when compress is not provided - RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R + RefType = compress ? 
reftype(length(_levels(A))) : R CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered) end @@ -1050,7 +1052,7 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R} if x.pool === y.pool return refcode(x) in y.refs else - ref = get(y.pool, levels(x.pool)[refcode(x)], zero(R)) + ref = get(y.pool, _levels(x.pool)[refcode(x)], zero(R)) return ref != 0 ? ref in y.refs : false end end diff --git a/src/pool.jl b/src/pool.jl index 9753a76d..2df7e345 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -21,8 +21,8 @@ Base.convert(::Type{CategoricalPool{S}}, pool::CategoricalPool{T, R}) where {S, convert(CategoricalPool{S, R}, pool) function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) where {T, R <: Integer} - if length(levels(pool)) > typemax(R) - throw(LevelsException{T, R}(levels(pool)[typemax(R)+1:end])) + if length(pool.levels) > typemax(R) + throw(LevelsException{T, R}(pool.levels[typemax(R)+1:end])) end levelsT = convert(Vector{T}, pool.levels) @@ -37,10 +37,10 @@ Base.copy(pool::CategoricalPool{T, R}) where {T, R} = function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R} @static if VERSION >= v"1.6.0" @printf(io, "%s{%s, %s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) else @printf(io, "%s{%s,%s}([%s])", CategoricalPool, T, R, - join(map(repr, levels(pool)), ", ")) + join(map(repr, pool.levels), ", ")) end pool.ordered && print(io, " with ordered levels") @@ -65,6 +65,7 @@ it doesn't do this itself to avoid doing a dict lookup twice i = R(n + 1) push!(pool.levels, x) + push!(pool.levelsinds, i) pool_hash = pool.hash if pool_hash !== nothing pool.hash = hash(x, pool_hash) @@ -185,10 +186,10 @@ function merge_pools(a::CategoricalPool{T}, b::CategoricalPool) where {T} newlevs = T[] ordered = isordered(a) elseif length(a) == 0 - newlevs = Vector{T}(levels(b)) + newlevs = Vector{T}(b.levels) ordered = isordered(b) elseif length(b) == 0 - 
newlevs = copy(levels(a)) + newlevs = copy(a.levels) ordered = isordered(a) else ordered = isordered(a) && (isordered(b) || b ⊆ a) @@ -200,7 +201,7 @@ end @inline function Base.hash(pool::CategoricalPool, h::UInt) if pool.hash === nothing - pool.hash = hashlevels(levels(pool)) + pool.hash = hashlevels(pool.levels) end hash(pool.hash, h) end @@ -246,9 +247,9 @@ end # Contrary to the CategoricalArray one, this method only allows adding new levels at the end # so that existing CategoricalValue objects still point to the same value -function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; +function levels!(pool::CategoricalPool{S, R}, newlevels::AbstractVector; checkunique::Bool=true) where {S, R} - levs = convert(Vector{S}, newlevels) + levs = newlevels isa CategoricalVector{S} ? newlevels : convert(Vector{S}, newlevels) if checkunique && !allunique(levs) throw(ArgumentError(string("duplicated levels found in levs: ", join(unique(filter(x->sum(levs.==x)>1, levs)), ", ")))) @@ -259,24 +260,30 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector; n = length(levs) if n > typemax(R) - throw(LevelsException{S, R}(setdiff(levs, levels(pool))[typemax(R)-length(levels(pool))+1:end])) + throw(LevelsException{S, R}(setdiff(levs, pool.levels)[typemax(R)-length(pool.levels)+1:end])) end empty!(pool.invindex) resize!(pool.levels, n) + resize!(pool.levelsinds, n) pool.hash = nothing pool.equalto = C_NULL pool.subsetof = C_NULL for i in 1:n v = levs[i] pool.levels[i] = v + pool.levelsinds[i] = i pool.invindex[v] = i end return pool end -DataAPI.levels(pool::CategoricalPool) = pool.levels +DataAPI.levels(pool::CategoricalPool{T}) where {T} = + CategoricalVector{T}(pool.levelsinds, pool) +levels_missing(pool::CategoricalPool{T}) where {T} = + CategoricalVector{Union{T, Missing}}(pool.levelsinds, pool) +_levels(pool::CategoricalPool) = pool.levels isordered(pool::CategoricalPool) = pool.ordered ordered!(pool::CategoricalPool, ordered) = (pool.ordered = ordered; 
pool) diff --git a/src/recode.jl b/src/recode.jl index 141f9967..ff258e60 100644 --- a/src/recode.jl +++ b/src/recode.jl @@ -111,7 +111,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau levels!(dest.pool, filter!(!ismissing, unique(vals))) # In the absence of duplicated recoded values, we do not need to lookup the reference # for each pair in the loop, which is more efficient (with loop unswitching) - dupvals = length(vals) != length(levels(dest.pool)) + dupvals = length(vals) != length(_levels(dest.pool)) drefs = dest.refs pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals] @@ -150,7 +150,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau # Put existing levels first, and sort them if possible # for consistency with CategoricalArray - oldlevels = setdiff(levels(dest), vals) + oldlevels = setdiff(_levels(dest), vals) filter!(!ismissing, oldlevels) L = eltype(oldlevels) if Base.OrderStyle(L) isa Base.Ordered @@ -163,7 +163,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau e isa MethodError || rethrow(e) end end - levels!(dest, union(oldlevels, levels(dest))) + levels!(dest, union(oldlevels, _levels(dest))) dest end @@ -174,7 +174,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, vals = T[p.second for p in pairs] if default === nothing - srclevels = levels(src) + srclevels = _levels(src) # Remove recoded levels as they won't appear in result keptlevels = Vector{T}(undef, 0) @@ -201,7 +201,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray, ordered = false end - srclevels = src.pool === dest.pool ? copy(levels(src.pool)) : levels(src.pool) + srclevels = src.pool === dest.pool ? 
copy(_levels(src.pool)) : _levels(src.pool) if length(levs) > length(srclevels) && view(levs, 1:length(srclevels)) == srclevels levels!(dest.pool, levs) else diff --git a/src/typedefs.jl b/src/typedefs.jl index 0f9aa414..238bb995 100644 --- a/src/typedefs.jl +++ b/src/typedefs.jl @@ -8,6 +8,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number} # * `R` integer type for referencing category levels mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} levels::Vector{T} # category levels ordered by their reference codes + levelsinds::Vector{R} # set to 1:length(levels), used by `levels(p)` invindex::Dict{T, R} # map from category levels to their reference codes ordered::Bool # whether levels can be compared using < hash::Union{UInt, Nothing} # hash of levels @@ -45,8 +46,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer} invindex::Dict{T, R}, ordered::Bool, hash::Union{UInt, Nothing}=nothing) where {T, R} - pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL) - return pool + return new(levels, 1:length(levels), invindex, + ordered, hash, C_NULL, C_NULL) end end diff --git a/src/value.jl b/src/value.jl index ae962adb..a1633204 100644 --- a/src/value.jl +++ b/src/value.jl @@ -27,6 +27,8 @@ reftype(x::Any) = reftype(typeof(x)) pool(x::CategoricalValue) = x.pool refcode(x::CategoricalValue) = x.ref isordered(x::CategoricalValue) = isordered(x.pool) +DataAPI.levels(x::CategoricalValue) = levels(pool(x)) +_levels(x::CategoricalValue) = _levels(pool(x)) # extract the type of the original value from array eltype `T` unwrap_catvaluetype(::Type{T}) where {T} = T @@ -42,7 +44,7 @@ unwrap_catvaluetype(::Type{T}) where {T <: CategoricalValue} = leveltype(T) Get the value wrapped by categorical value `x`. If `x` is `Missing` return `missing`. 
""" -DataAPI.unwrap(x::CategoricalValue) = levels(x)[refcode(x)] +DataAPI.unwrap(x::CategoricalValue) = _levels(x)[refcode(x)] """ levelcode(x::CategoricalValue) @@ -59,10 +61,8 @@ Return `missing`. """ levelcode(x::Missing) = missing -DataAPI.levels(x::CategoricalValue) = levels(pool(x)) - function cat_promote_type(::Type{S}, ::Type{T}) where {S, T} - U = promote_type(S, T) + U = promote_type(unwrap_catvaluetype(S), unwrap_catvaluetype(T)) U <: Union{SupportedTypes, Missing} ? U : typeintersect(Union{SupportedTypes, Missing}, Union{S, T}) end diff --git a/test/01_value.jl b/test/01_value.jl index 39f58b67..8c60ae7f 100644 --- a/test/01_value.jl +++ b/test/01_value.jl @@ -22,6 +22,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt32} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === DefaultRefType @@ -48,6 +50,8 @@ end for i in 1:3 x = CategoricalValue(pool, i) + @test levels(x) == levels(pool) + @test levels(x) isa CategoricalVector{String, UInt8} @test leveltype(x) === String @test leveltype(typeof(x)) === String @test reftype(x) === UInt8 @@ -68,7 +72,8 @@ end for x in (CategoricalValue(pool, 1), arr, view(arr, 2:3)) for (i, v) in enumerate(levels(pool)) @test CategoricalValue(v, x) === - CategoricalValue(float(v), x) === + CategoricalValue(unwrap(v), x) === + CategoricalValue(float(unwrap(v)), x) === CategoricalValue(CategoricalValue(pool, i), x) === CategoricalValue(pool, i) end diff --git a/test/07_levels.jl b/test/07_levels.jl index 25c54be0..b54e4d52 100644 --- a/test/07_levels.jl +++ b/test/07_levels.jl @@ -1,15 +1,16 @@ module TestLevels using Test using CategoricalArrays -using CategoricalArrays: DefaultRefType, levels!, hashlevels +using CategoricalArrays: DefaultRefType, levels!, hashlevels, _levels @testset "CategoricalPool{Int} updates levels and order correctly" begin pool = CategoricalPool([2, 1, 3]) - @test 
isa(levels(pool), Vector{Int}) + @test isa(levels(pool), CategoricalVector{Int, DefaultRefType}) @test length(pool) === 3 - @test levels(pool) == [2, 1, 3] - @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .=== Ref(levels(pool))) + @test levels(pool) == _levels(pool) == [2, 1, 3] + @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .== Ref(levels(pool))) + @test pool.levelsinds == 1:3 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -20,7 +21,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 4 - @test levels(pool) == [2, 1, 3, 4] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4] + @test pool.levelsinds == 1:4 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -34,7 +36,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 5 - @test levels(pool) == [2, 1, 3, 4, 0] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0] + @test pool.levelsinds == 1:5 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -48,7 +51,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 7 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11] + @test pool.levelsinds == 1:7 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -64,7 +68,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) === 9 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13] + @test pool.levelsinds == 1:9 @test pool.invindex == 
Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9) @test pool.hash === nothing @test pool.equalto == C_NULL @@ -84,15 +89,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels # Adding levels while preserving existing ones levs = [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] @test levels!(pool, levs) === pool - @test levels(pool) == levs - @test levels(pool) !== levs - @test pool.hash === nothing - @test pool.equalto == C_NULL - @test pool.subsetof == C_NULL - + @test levels(pool) == _levels(pool) == levs + @test pool.levels !== levs @test isa(pool.levels, Vector{Int}) - @test length(pool) === 11 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14] + @test pool.levelsinds == 1:11 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11) @test pool.hash === nothing @@ -109,7 +109,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 12 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20] + @test pool.levelsinds == 1:12 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12) @test pool.hash === nothing @@ -128,7 +131,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === nothing @@ -143,7 +149,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 
3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -155,7 +164,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test isa(pool.levels, Vector{Int}) @test length(pool) == 14 - @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test levels(pool) == + _levels(pool) == + [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99] + @test pool.levelsinds == 1:14 @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9, 15=>10, 14=>11, 20=>12, 100=>13, 99=>14) @test pool.hash === CategoricalArrays.hashlevels(levels(pool)) @@ -178,6 +190,22 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels @test !isordered(p2) end +@testset "levels!(::CategoricalPool, ::CategoricalVector)" begin + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2, 1, 3, 4])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + levels!(pool, categorical([2.0, 1.0, 3.0, 4.0])) + @test levels(pool) == [2, 1, 3, 4] + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical([2, 2, 1, 3, 4])) + + pool = CategoricalPool([2, 1, 3]) + @test_throws ArgumentError levels!(pool, categorical(1:3)) +end + @testset "overflow of reftype is detected and doesn't corrupt levels" begin res = @test_throws LevelsException{Int, UInt8} CategoricalPool{Int, UInt8}(collect(256:-1:1)) @test res.value.levels == [1] diff --git a/test/11_array.jl b/test/11_array.jl index b474cfe1..4f332640 100644 --- a/test/11_array.jl +++ b/test/11_array.jl @@ -746,7 +746,7 @@ end @test y == unique(x) x = CategoricalArray(String[]) - @test isa(levels(x), Vector{String}) && 
isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test isa(unique(x), typeof(x)) && isempty(unique(x)) @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl index a2204e40..5c2ed3a9 100644 --- a/test/12_missingarray.jl +++ b/test/12_missingarray.jl @@ -1160,7 +1160,7 @@ end @test unique(x) ≅ ["Old", "Young", "Middle", missing] x = CategoricalArray((Union{String, Missing})[missing]) - @test isa(levels(x), Vector{String}) && isempty(levels(x)) + @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x)) @test unique(x) ≅ [missing] @test levels!(x, ["Young", "Middle", "Old"]) === x @test levels(x) == ["Young", "Middle", "Old"] diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl index 2cf3ff6b..e95be673 100644 --- a/test/13_arraycommon.jl +++ b/test/13_arraycommon.jl @@ -2424,18 +2424,18 @@ end view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) == ["b", "c", "a"] - @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]), view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3)) @test @inferred(levels(x)) == ["b", "c", "a"] @test levels(x, skipmissing=true) == ["b", "c", "a"] - @test levels(x, skipmissing=true) isa Vector{String} + @test levels(x, skipmissing=true) isa CategoricalVector{String} @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing] - @test 
levels(x, skipmissing=false) isa Vector{Union{String, Missing}} + @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}} end end diff --git a/test/14_view.jl b/test/14_view.jl index 79b20812..11853853 100644 --- a/test/14_view.jl +++ b/test/14_view.jl @@ -11,7 +11,8 @@ const ≅ = isequal x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order) v = view(x, inds) - @test levels(v) === levels(x) + @test levels(x) isa CategoricalVector{nonmissingtype(eltype(a))} + @test levels(v) == levels(x) @test unique(v) == (ndims(v) > 0 ? unique(a[inds]) : [a[inds]]) @test isordered(v) === isordered(x) end From 49e200af3edea6fc528feb10a8a3a88f8b88f79d Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 31 Jul 2025 12:42:10 +0200 Subject: [PATCH 25/25] Release version 1.0 (#426) --- NEWS.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 NEWS.md diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..61842496 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,48 @@ +# CategoricalArrays.jl v1.0.0 Release Notes + +## Breaking changes + +* `unique(::CategoricalArray)` and `levels(::CategoricalArray)` return + a `CategoricalArray` instead of unwrapping values, consistent with + `unique(::AbstractArray)` in Base and `levels(::AbstractArray)` in DataAPI + ([#358](https://github.com/JuliaData/CategoricalArrays.jl/pull/358), + [#425](https://github.com/JuliaData/CategoricalArrays.jl/pull/425)). + +* `cut` always closes the last interval on the right + ([#409](https://github.com/JuliaData/CategoricalArrays.jl/pull/409)). + +* `cut(x, breaks)` rounds breaks to generate shorter labels + ([#422](https://github.com/JuliaData/CategoricalArrays.jl/pull/422)). 
+ +* `cut(x, ngroups)` takes breaks from actual values instead of using + quantile estimates which are generally longer + ([#416](https://github.com/JuliaData/CategoricalArrays.jl/pull/416)). + This only changes group labels, not their contents. + +* `T(::CategoricalArray{U})` and `convert(T, ::CategoricalArray{U})` + now consistently return an `Array{U}` for `T` in `Array`, `Vector`, `Matrix`. + This avoids creating `Array{<:CategoricalValue}` objects unless explicitly requested + ([#420](https://github.com/JuliaData/CategoricalArrays.jl/pull/420)). + + +* All deprecations have been removed + ([#419](https://github.com/JuliaData/CategoricalArrays.jl/pull/419)). + +## New features + +* Support reading from and writing to Arrow files + ([#415](https://github.com/JuliaData/CategoricalArrays.jl/pull/415)). + +* Improve performance of `recode` + ([#407](https://github.com/JuliaData/CategoricalArrays.jl/pull/407)). + +* Support weighted quantiles in `cut` + ([#423](https://github.com/JuliaData/CategoricalArrays.jl/pull/423)). + +## Bug fixes + +* Fix performance regression on Julia 1.11 and above + ([#418](https://github.com/JuliaData/CategoricalArrays.jl/pull/418)). + +* Fix `cut` corner cases with duplicated breaks + ([#410](https://github.com/JuliaData/CategoricalArrays.jl/pull/410)). diff --git a/Project.toml b/Project.toml index a9262e93..83d5ba30 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "CategoricalArrays" uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.8" +version = "1.0.0" [deps] Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"