From 246e8f0c86fbc08a664266c9e0edffbc24727298 Mon Sep 17 00:00:00 2001
From: Andreas Noack <andreas@noack.dk>
Date: Wed, 10 May 2023 13:27:46 +0200
Subject: [PATCH 01/25] Update CompatHelper.yml

---
 .github/workflows/CompatHelper.yml | 47 ++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 7344a549..8d889a9d 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -1,24 +1,39 @@
 name: CompatHelper
-
 on:
   schedule:
-    - cron: '00 00 * * *'
-
+    - cron: 0 0 * * *
+  workflow_dispatch:
+permissions:
+  contents: write
+  pull-requests: write
 jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        julia-version: [1.2.0]
-        julia-arch: [x86]
-        os: [ubuntu-latest]
+  CompatHelper:
+    runs-on: ubuntu-latest
     steps:
-      - uses: julia-actions/setup-julia@latest
+      - name: Check if Julia is already available in the PATH
+        id: julia_in_path
+        run: which julia
+        continue-on-error: true
+      - name: Install Julia, but only if it is not already available in the PATH
+        uses: julia-actions/setup-julia@v1
         with:
-          version: ${{ matrix.julia-version }}
-      - name: Pkg.add("CompatHelper")
-        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
-      - name: CompatHelper.main()
+          version: '1'
+          arch: ${{ runner.arch }}
+        if: steps.julia_in_path.outcome != 'success'
+      - name: "Install CompatHelper"
+        run: |
+          import Pkg
+          name = "CompatHelper"
+          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
+          version = "3"
+          Pkg.add(; name, uuid, version)
+        shell: julia --color=yes {0}
+      - name: "Run CompatHelper"
+        run: |
+          import CompatHelper
+          CompatHelper.main()
+        shell: julia --color=yes {0}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
+          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
+          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}

From d61d91129903db0171b5da642ca6d2cd7da650a8 Mon Sep 17 00:00:00 2001
From: Matthijs Cox <79519355+matthijscox-asml@users.noreply.github.com>
Date: Fri, 27 Dec 2024 10:12:13 +0100
Subject: [PATCH 02/25] remove JET runtime dispatch error (#408)

---
 src/pool.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pool.jl b/src/pool.jl
index 1018cfee..0ece21ce 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -67,8 +67,9 @@ it doesn't do this itself to avoid doing a dict lookup twice
 
     i = R(n + 1)
     push!(pool.levels, x)
-    if pool.hash !== nothing
-        pool.hash = hash(x, pool.hash)
+    pool_hash = pool.hash
+    if pool_hash !== nothing
+        pool.hash = hash(x, pool_hash)
     end
     pool.equalto = C_NULL
     pool.subsetof = C_NULL

From bbac8dc79ae02d89d8728d0c8155afc7e8d9eedf Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Fri, 27 Dec 2024 22:57:40 +0100
Subject: [PATCH 03/25] Make `cut` close last interval on the right (#409)

This is much more useful, though slightly breaking.
---
 src/extras.jl         | 12 +++++-----
 test/15_extras.jl     | 52 ++++++++++++++++++++++++++-----------------
 test/17_deprecated.jl |  2 +-
 3 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/src/extras.jl b/src/extras.jl
index 137875b8..f536f06f 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -11,9 +11,9 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
 
         if ismissing(x)
             refs[i] = 0
-        elseif extend === true && x == upper
+        elseif x == upper
             refs[i] = n-1
-        elseif extend !== true && !(lower <= x < upper)
+        elseif extend !== true && !(lower <= x <= upper)
             extend === missing ||
                 throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " *
                                     "adapt them manually, or pass extend=true or extend=missing"))
@@ -41,8 +41,7 @@ Cut a numeric array into intervals at values `breaks`
 and return an ordered `CategoricalArray` indicating
 the interval into which each entry falls. Intervals are of the form `[lower, upper)`,
 i.e. the lower bound is included and the upper bound is excluded, except
-if `extend=true` the last interval, which is then closed on both ends,
-i.e. `[lower, upper]`.
+the last interval, which is closed on both ends, i.e. `[lower, upper]`.
 
 If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
 also accept them.
@@ -50,8 +49,7 @@ also accept them.
 # Keyword arguments
 * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values
   in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
-  all values in `x`, and the upper bound is included in the last interval; when `missing`,
-  values outside of the breaks generate `missing` entries.
+  all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
 * `labels::Union{AbstractVector, Function}`: a vector of strings, characters
   or numbers giving the names to use for
   the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
@@ -200,7 +198,7 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
         end
         levs[end] = labels(from[end], to[end], n-1,
                            leftclosed=breaks[end-1] != breaks[end],
-                           rightclosed=coalesce(extend, false))
+                           rightclosed=true)
     else
         length(labels) == n-1 ||
             throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
diff --git a/test/15_extras.jl b/test/15_extras.jl
index 472885a1..14fb4352 100644
--- a/test/15_extras.jl
+++ b/test/15_extras.jl
@@ -6,27 +6,37 @@ const ≅ = isequal
 
 @testset "cut($(Union{Int, T})[...])" for T in (Union{}, Missing)
     x = @inferred cut(Vector{Union{Int, T}}([2, 3, 5]), [1, 3, 6])
-    @test x == ["[1, 3)", "[3, 6)", "[3, 6)"]
+    @test x == ["[1, 3)", "[3, 6]", "[3, 6]"]
     @test isa(x, CategoricalVector{Union{String, T}})
     @test isordered(x)
-    @test levels(x) == ["[1, 3)", "[3, 6)"]
+    @test levels(x) == ["[1, 3)", "[3, 6]"]
+
+    @test cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=false) ==
+        ["[2, 5]", "[2, 5]", "[2, 5]"]
 
     err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6])
     @test err.value.msg == "value 2 (at index 1) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing"
 
 
-    err = @test_throws ArgumentError cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5])
-    @test err.value.msg == "value 5 (at index 3) does not fall inside the breaks: adapt them manually, or pass extend=true or extend=missing"
-
     if T === Missing
         x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing)
     else
         x = cut(Vector{Union{T, Int}}([2, 3, 5]), [2, 5], extend=missing)
     end
-    @test x ≅ ["[2, 5)", "[2, 5)", missing]
+    @test x ≅ ["[2, 5]", "[2, 5]", "[2, 5]"]
     @test isa(x, CategoricalVector{Union{String, Missing}})
     @test isordered(x)
-    @test levels(x) == ["[2, 5)"]
+    @test levels(x) == ["[2, 5]"]
+
+    if T === Missing
+        x = @inferred cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing)
+    else
+        x = cut(Vector{Union{T, Int}}([2, 3, 6]), [2, 5], extend=missing)
+    end
+    @test x ≅ ["[2, 5]", "[2, 5]", missing]
+    @test isa(x, CategoricalVector{Union{String, Missing}})
+    @test isordered(x)
+    @test levels(x) == ["[2, 5]"]
 
     x = @inferred cut(Vector{Union{T, Int}}([2, 3, 5]), [3, 6], extend=true)
     @test x == ["[2, 3)", "[3, 6]", "[3, 6]"]
@@ -40,10 +50,10 @@ const ≅ = isequal
     @test levels(x) == ["[2, 3)", "[3, 6]"]
 
     x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [1, 3, 6])
-    @test x == ["[1, 3)", "[1, 3)", "[3, 6)"]
+    @test x == ["[1, 3)", "[1, 3)", "[3, 6]"]
     @test isa(x, CategoricalVector{Union{String, T}})
     @test isordered(x)
-    @test levels(x) == ["[1, 3)", "[3, 6)"]
+    @test levels(x) == ["[1, 3)", "[3, 6]"]
 
     x = @inferred cut(Vector{Union{T, Int}}([1, 2, 4]), [3, 6], extend=true)
     @test x == ["[1, 3)", "[1, 3)", "[3, 6]"]
@@ -67,10 +77,10 @@ const ≅ = isequal
     breaks = [18, 25, 35, 60, 100]
     x = @inferred cut(Vector{Union{T, Int}}(ages), breaks)
     @test x == ["[18, 25)", "[18, 25)", "[25, 35)", "[25, 35)", "[18, 25)", "[18, 25)",
-                "[35, 60)", "[25, 35)", "[60, 100)", "[35, 60)", "[35, 60)", "[25, 35)"]
+                "[35, 60)", "[25, 35)", "[60, 100]", "[35, 60)", "[35, 60)", "[25, 35)"]
     @test isa(x, CategoricalVector{Union{String, T}})
     @test isordered(x)
-    @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100)"]
+    @test levels(x) == ["[18, 25)", "[25, 35)", "[35, 60)", "[60, 100]"]
 
     breaks = [1, 6, 3] # Unsorted breaks
     labels = ["b", "a"] # Differs from lexical ordering
@@ -83,10 +93,10 @@ const ≅ = isequal
     @test levels(x) == ["b", "a"]
 
     x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5])
-    @test x == ["[-2.134, 3.0)" "[3.0, 12.5)"; "[-2.134, 3.0)" "[3.0, 12.5)"]
+    @test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"]
     @test isa(x, CategoricalMatrix{Union{String, T}})
     @test isordered(x)
-    @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5)"]
+    @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"]
 
     labels = 0:2:8
     x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels)
@@ -179,7 +189,7 @@ end
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11])
     y = cut(1:10, [1, 5, 5, 11], allowempty=true)
     @test y == cut(1:10, [1, 5, 11])
-    @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11)"]
+    @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"]
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11])
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11],
@@ -191,29 +201,29 @@ end
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11], labels=string.(1:3))
     y = cut(1:10, [1, 5, 5, 11], allowempty=true, labels=string.(1:3))
-    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "3")
+    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "3")
     @test levels(y) == string.(1:3)
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11], labels=string.(1:4))
     y = cut(1:10, [1, 5, 5, 5, 11], allowempty=true, labels=string.(1:4))
-    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "4")
+    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "4")
     @test levels(y) == string.(1:4)
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 5, 11], labels=string.(1:5))
     y = cut(1:10, [1, 5, 5, 5, 5, 11], allowempty=true, labels=string.(1:5))
-    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11)" => "5")
+    @test y == recode(cut(1:10, [1, 5, 11]), "[1, 5)" => "1", "[5, 11]" => "5")
     @test levels(y) == string.(1:5)
 
     @test_throws ArgumentError cut(1:10, [1, 3, 3, 5, 5, 11], labels=string.(1:5))
     y = cut(1:10, [1, 3, 3, 5, 5, 11], allowempty=true, labels=string.(1:5))
     @test y == recode(cut(1:10, [1, 3, 5, 11]),
-                      "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11)" => "5")
+                      "[1, 3)" => "1", "[3, 5)" => "3", "[5, 11]" => "5")
     @test levels(y) == string.(1:5)
 
     @test_throws ArgumentError cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], labels=string.(1:7))
     y = cut(1:10, [1, 3, 3, 3, 5, 5, 5, 11], allowempty=true, labels=string.(1:7))
     @test y == recode(cut(1:10, [1, 3, 5, 11]),
-                      "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11)" => "7")
+                      "[1, 3)" => "1", "[3, 5)" => "4", "[5, 11]" => "7")
     @test levels(y) == string.(1:7)
 
     @test_throws ArgumentError cut(1:10, [1, 3, 5, 5, 11],
@@ -255,9 +265,9 @@ end
 end
 
 @testset "cut with extend=missing" begin
-    x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0], [-0.0, 0.0, 3.0],
+    x = @inferred cut([-0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0], [-0.0, 0.0, 3.0],
                       labels=[-0.0, 0.0], extend=missing)
-    @test x ≅ [-0.0, 0.0, 0.0, 0.0, missing, missing]
+    @test x ≅ [-0.0, 0.0, 0.0, 0.0, 0.0, missing, missing]
     @test x isa CategoricalArray{Union{Missing, Float64},1,UInt32}
     @test isordered(x)
     @test levels(x) == [-0.0, 0.0]
diff --git a/test/17_deprecated.jl b/test/17_deprecated.jl
index bc492484..d5a08ff4 100644
--- a/test/17_deprecated.jl
+++ b/test/17_deprecated.jl
@@ -10,7 +10,7 @@ const ≅ = isequal
     @test x ≅ ["a", missing, missing]
 
     x = cut([1, missing, 100], [1, 2], allow_missing=true)
-    @test x ≅ ["[1, 2)", missing, missing]
+    @test x ≅ ["[1, 2]", missing, missing]
 end
 
 end
\ No newline at end of file

From 9eca0c765fdbc03944420eebeefb1d3f3e1a0e8a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Sat, 28 Dec 2024 15:15:48 +0100
Subject: [PATCH 04/25] CompatHelper: add new compat entry for Statistics at
 version 1, (keep existing compat) (#411)

Co-authored-by: CompatHelper Julia <compathelper_noreply@julialang.org>
---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 5846e340..6b388bcb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -31,6 +31,7 @@ Missings = "0.4.3, 1"
 RecipesBase = "1.1"
 Requires = "1"
 SentinelArrays = "1"
+Statistics = "1"
 StructTypes = "1"
 julia = "1"
 
@@ -46,5 +47,4 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays",
-        "RecipesBase", "SentinelArrays", "StructTypes", "Test"]
+test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]

From 9f9acba2ebbb7aeb2c4f080b51c7bdc97b9ebcb9 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Sat, 28 Dec 2024 15:23:39 +0100
Subject: [PATCH 05/25] Update CI

---
 .github/workflows/ci.yml | 10 +++++-----
 docs/Project.toml        |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index aaeda107..06fddf55 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,12 +26,12 @@ jobs:
           - os: macOS-latest
             arch: x86
     steps:
-      - uses: actions/checkout@v2
-      - uses: julia-actions/setup-julia@v1
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v1
+      - uses: actions/cache@v2
         env:
           cache-name: cache-artifacts
         with:
@@ -44,14 +44,14 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v1
+      - uses: codecov/codecov-action@v5
         with:
           file: lcov.info
   docs:
     name: Documentation
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/julia-buildpkg@latest
       - uses: julia-actions/julia-docdeploy@latest
         env:
diff --git a/docs/Project.toml b/docs/Project.toml
index 1a6d3094..1814eb33 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,4 +2,4 @@
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 
 [compat]
-Documenter = "~0.27"
+Documenter = "1"

From 55f000a2d38f4c8c593adf0fa52f9158f379cb7d Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Sat, 28 Dec 2024 15:44:02 +0100
Subject: [PATCH 06/25] Enable Dependabot

---
 .github/dependabot.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..adee0ed1
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
\ No newline at end of file

From 4434fe429e6823e5049093b714bcba00f4f5794a Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Sat, 28 Dec 2024 15:52:12 +0100
Subject: [PATCH 07/25] Fix Documenter

---
 docs/make.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index 6a5e3be8..1b260579 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -15,8 +15,7 @@ makedocs(
         "Implementation details" => "implementation.md",
         "API index" => "apiindex.md"
         ],
-    checkdocs = :exports,
-    strict=true
+    checkdocs = :exports
 )
 
 deploydocs(

From 341de709cfbd4586f8198daa382463654d538fe0 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Mon, 30 Dec 2024 22:47:04 +0100
Subject: [PATCH 08/25] Fix corner cases of cut (#410)

Apply more systematically the rule that all intervals are closed on the left
and open on the right except the last one. Throw an error when duplicated
breaks would lead to empty intervals, unless `allowempty=true`.
Improve handling of -0.0, NaN and Inf.
---
 src/extras.jl     | 66 ++++++++++++++++++++--------------
 test/15_extras.jl | 91 +++++++++++++++++++++++++++++++++++++++++++++--
 test/runtests.jl  |  2 ++
 3 files changed, 130 insertions(+), 29 deletions(-)

diff --git a/src/extras.jl b/src/extras.jl
index f536f06f..2afcef38 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -9,11 +9,14 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
     @inbounds for i in eachindex(X)
         x = X[i]
 
-        if ismissing(x)
+        if x isa Number && isnan(x)
+            throw(ArgumentError("NaN values are not allowed in input vector"))
+        elseif ismissing(x)
             refs[i] = 0
-        elseif x == upper
+        elseif isequal(x, upper)
             refs[i] = n-1
-        elseif extend !== true && !(lower <= x <= upper)
+        elseif extend !== true &&
+            !((isless(lower, x) || isequal(x, lower)) && isless(x, upper))
             extend === missing ||
                 throw(ArgumentError("value $x (at index $i) does not fall inside the breaks: " *
                                     "adapt them manually, or pass extend=true or extend=missing"))
@@ -55,10 +58,10 @@ also accept them.
   the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
   the labels from the left and right interval boundaries and the group index. Defaults to
   `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
-* `allowempty::Bool=false`: when `false`, an error is raised if some breaks appear
-  multiple times, generating empty intervals; when `true`, duplicate breaks are allowed
-  and the intervals they generate are kept as unused levels
-  (but duplicate labels are not allowed).
+* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
+  the last one appear multiple times, generating empty intervals; when `true`,
+  duplicate breaks are allowed and the intervals they generate are kept as
+  unused levels (but duplicate labels are not allowed).
 
 # Examples
 ```jldoctest
@@ -132,14 +135,19 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
               extend::Union{Bool, Missing},
               labels::Union{AbstractVector{<:SupportedTypes},Function},
               allowempty::Bool=false) where {T, N}
-    if !allowempty && !allunique(breaks)
-        throw(ArgumentError("all breaks must be unique unless `allowempty=true`"))
-    end
-
     if !issorted(breaks)
         breaks = sort(breaks)
     end
 
+    if any(x -> x isa Number && isnan(x), breaks)
+        throw(ArgumentError("NaN values are not allowed in breaks"))
+    end
+
+    if !allowempty && !allunique(@view breaks[1:end-1])
+        throw(ArgumentError("all breaks other than the last one must be unique " *
+                            "unless `allowempty=true`"))
+    end
+
     if extend === true
         xnm = T >: Missing ? skipmissing(x) : x
         length(breaks) >= 1 || throw(ArgumentError("at least one break must be provided"))
@@ -158,11 +166,11 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
                 rethrow(err)
             end
         end
-        if !ismissing(min_x) && breaks[1] > min_x
+        if !ismissing(min_x) && isless(min_x, breaks[1])
             # this type annotation is needed on Julia<1.7 for stable inference
             breaks = [min_x::nonmissingtype(eltype(x)); breaks]
         end
-        if !ismissing(max_x) && breaks[end] < max_x
+        if !ismissing(max_x) && isless(breaks[end], max_x)
             breaks = [breaks; max_x::nonmissingtype(eltype(x))]
         end
         length(breaks) > 1 ||
@@ -189,16 +197,15 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
         from = breaks[1:n-1]
         to = breaks[2:n]
         firstlevel = labels(from[1], to[1], 1,
-                            leftclosed=breaks[1] != breaks[2], rightclosed=false)
+                            leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
         levs = Vector{typeof(firstlevel)}(undef, n-1)
         levs[1] = firstlevel
         for i in 2:n-2
             levs[i] = labels(from[i], to[i], i,
-                             leftclosed=breaks[i] != breaks[i+1], rightclosed=false)
+                             leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false)
         end
         levs[end] = labels(from[end], to[end], n-1,
-                           leftclosed=breaks[end-1] != breaks[end],
-                           rightclosed=true)
+                           leftclosed=true, rightclosed=true)
     else
         length(labels) == n-1 ||
             throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
@@ -243,21 +250,28 @@ quantiles.
   the labels from the left and right interval boundaries and the group index. Defaults to
   `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
 * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints
-  are equal, generating empty intervals; when `true`, duplicate breaks are allowed
-  and the intervals they generate are kept as unused levels
-  (but duplicate labels are not allowed).
+  other than the last one are equal, generating empty intervals;
+  when `true`, duplicate breaks are allowed and the intervals they generate are kept as
+  unused levels (but duplicate labels are not allowed).
 """
 function cut(x::AbstractArray, ngroups::Integer;
              labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter,
              allowempty::Bool=false)
+    ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
     xnm = eltype(x) >: Missing ? skipmissing(x) : x
-    breaks = Statistics.quantile(xnm, (1:ngroups-1)/ngroups)
-    if !allowempty && !allunique(breaks)
-        n = length(unique(breaks)) - 1
-        throw(ArgumentError("cannot compute $ngroups quantiles: `quantile` " *
-                            "returned only $n groups due to duplicated values in `x`." *
+    # Computing extrema is faster than taking 0 and 1 quantiles
+    min_x, max_x = extrema(xnm)
+    if (min_x isa Number && isnan(min_x)) ||
+        (max_x isa Number && isnan(max_x))
+        throw(ArgumentError("NaN values are not allowed in input vector"))
+    end
+    breaks = quantile(xnm, (1:ngroups-1)/ngroups)
+    breaks = [min_x; breaks; max_x]
+    if !allowempty && !allunique(@view breaks[1:end-1])
+        throw(ArgumentError("cannot compute $ngroups quantiles due to " *
+                            "too many duplicated values in `x`. " *
                             "Pass `allowempty=true` to allow empty quantiles or " *
                             "choose a lower value for `ngroups`."))
     end
-    cut(x, breaks; extend=true, labels=labels, allowempty=allowempty)
+    cut(x, breaks; labels=labels, allowempty=allowempty)
 end
diff --git a/test/15_extras.jl b/test/15_extras.jl
index 14fb4352..1aaf8dc7 100644
--- a/test/15_extras.jl
+++ b/test/15_extras.jl
@@ -111,9 +111,6 @@ const ≅ = isequal
     @test isa(x, CategoricalVector{Union{Int, String, T}})
     @test isordered(x)
     @test levels(x) == [0, "2", 4, "6", 8]
-
-    @test_throws ArgumentError cut([-0.0, 0.0], 2)
-    @test_throws ArgumentError cut([-0.0, 0.0], 2, labels=[-0.0, 0.0])
 end
 
 @testset "cut with missing values in input" begin
@@ -144,6 +141,11 @@ end
     @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"]
 end
 
+@testset "cut(x, n) with invalid n" begin
+    @test_throws ArgumentError cut(1:10, 0)
+    @test_throws ArgumentError cut(1:10, -1)
+end
+
 @testset "cut with formatter function" begin
     my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to"
 
@@ -185,11 +187,20 @@ end
     x = [zeros(10); ones(10)]
     @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10])
     @test_throws ArgumentError cut(x, 10)
+    y = cut(x, [0, 0.1, 10, 10])
+    @test y == [fill("[0.0, 0.1)", 10); fill("[0.1, 10.0)", 10)]
+    @test levels(y) == ["[0.0, 0.1)", "[0.1, 10.0)", "[10.0, 10.0]"]
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11])
     y = cut(1:10, [1, 5, 5, 11], allowempty=true)
     @test y == cut(1:10, [1, 5, 11])
     @test levels(y) == ["[1, 5)", "(5, 5)", "[5, 11]"]
+    y = cut(1:10, [1, 5, 11, 11])
+    @test y == [fill("[1, 5)", 4); fill("[5, 11)", 6)]
+    @test levels(y) == ["[1, 5)", "[5, 11)", "[11, 11]"]
+    y = cut(1:10, [1, 5, 10, 10])
+    @test y == [fill("[1, 5)", 4); fill("[5, 10)", 5); "[10, 10]"]
+    @test levels(y) == ["[1, 5)", "[5, 10)", "[10, 10]"]
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 5, 11])
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11],
@@ -242,6 +253,49 @@ end
 
     fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0)
     @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt)
+
+    @test_throws ArgumentError cut([fill(1, 10); 4], 2)
+    @test_throws ArgumentError cut([fill(1, 10); 4], 3)
+    x = cut([fill(1, 10); 4], 2, allowempty=true)
+    @test unique(x) == ["Q2: [1.0, 4.0]"]
+    x = cut([fill(1, 10); 4], 3, allowempty=true)
+    @test unique(x) == ["Q3: [1.0, 4.0]"]
+    @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"]
+
+    x = cut([fill(1, 5); fill(4, 5)], 2)
+    @test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)]
+    @test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"]
+    @test_throws ArgumentError  cut([fill(1, 5); fill(4, 5)], 3)
+    x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true)
+    @test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)]
+    @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"]
+end
+
+@testset "cut with -0.0" begin
+    x = cut([-0.0, 0.0, 0.0, -0.0], 2)
+    @test x == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]", "Q2: [0.0, 0.0]", "Q1: [-0.0, 0.0)"]
+    @test levels(x) == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]"]
+
+    x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0])
+    @test x == ["[-0.0, 0.0)", "[0.0, 0.0]", "[0.0, 0.0]", "[-0.0, 0.0)"]
+    @test levels(x) == ["[-0.0, 0.0)", "[0.0, 0.0]"]
+
+    x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0])
+    @test x == fill("[-0.0, 0.0]", 4)
+    @test levels(x) == ["[-0.0, 0.0]"]
+
+    x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true)
+    @test x == fill("[-0.0, 0.0]", 4)
+    @test levels(x) == ["[-0.0, 0.0]"]
+
+    x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true)
+    @test x == fill("[-0.0, 0.0]", 4)
+    @test levels(x) == ["[-0.0, 0.0]"]
+
+    x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0])
+    @test x == [-0.0, 0.0, 0.0, -0.0]
+
+    @test_throws ArgumentError cut([-0.0, 0.0, 0.0, -0.0], [-0.0, -0.0, 0.0])
 end
 
 @testset "cut with extend=true" begin
@@ -276,4 +330,35 @@ end
     @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"]
 end
 
+@testset "cut with NaN and Inf" begin
+    @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1, 10])
+    @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], [1], extend=true)
+    @test_throws ArgumentError("NaN values are not allowed in input vector") cut([1, NaN, 2, 3], 2)
+    @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN])
+
+    x = cut([1, Inf], [1], extend=true)
+    @test x ≅ ["[1.0, Inf]", "[1.0, Inf]"]
+    @test levels(x) == ["[1.0, Inf]"]
+
+    x = cut([1, -Inf], [1], extend=true)
+    @test x ≅ ["[-Inf, 1.0]", "[-Inf, 1.0]"]
+    @test levels(x) == ["[-Inf, 1.0]"]
+
+    x = cut([1:5; Inf], [1, 2, Inf])
+    @test x ≅ ["[1.0, 2.0)"; fill("[2.0, Inf]", 5)]
+    @test levels(x) == ["[1.0, 2.0)", "[2.0, Inf]"]
+
+    x = cut([1:5; -Inf], [-Inf, 2, 5])
+    @test x ≅ ["[-Inf, 2.0)"; fill("[2.0, 5.0]", 4); "[-Inf, 2.0)"]
+    @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"]
+
+    x = cut([1:5; Inf], 2)
+    @test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)]
+    @test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"]
+
+    x = cut([1:5; -Inf], 2)
+    @test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"]
+    @test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"]
 end
+
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 142bd15f..e59180e7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,6 +10,8 @@ module TestCategoricalArrays
     using Test
     using CategoricalArrays
 
+    const ≊ = isequal
+
     tests = [
         "01_value.jl",
         "04_constructors.jl",

From 3e0d05653ed9b92fefecdc059f9358f8601a1af6 Mon Sep 17 00:00:00 2001
From: Tiem van der Deure <tiemvanderdeure@gmail.com>
Date: Fri, 3 Jan 2025 23:38:10 +0100
Subject: [PATCH 09/25] Make recode! type stable (#407)

Varargs appear to be type-stable according to `@code_warntype`
but in practice that's not the case.
---
 Project.toml             |  2 +
 src/CategoricalArrays.jl |  1 +
 src/recode.jl            | 84 +++++++++++++++++++---------------------
 3 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/Project.toml b/Project.toml
index 6b388bcb..4593b00b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,6 +3,7 @@ uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 version = "0.10.8"
 
 [deps]
+Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 Future = "9fa8497b-333b-5362-9e8d-4d0656e87820"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
@@ -24,6 +25,7 @@ CategoricalArraysSentinelArraysExt = "SentinelArrays"
 CategoricalArraysStructTypesExt = "StructTypes"
 
 [compat]
+Compat = "3.37, 4"
 DataAPI = "1.6"
 JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
 JSON3 = "1.1.2"
diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl
index 214a5d17..a28cba94 100644
--- a/src/CategoricalArrays.jl
+++ b/src/CategoricalArrays.jl
@@ -14,6 +14,7 @@ module CategoricalArrays
     using DataAPI
     using Missings
     using Printf
+    import Compat
 
     # JuliaLang/julia#36810
     if VERSION < v"1.5.2"
diff --git a/src/recode.jl b/src/recode.jl
index 282d4fb6..141f9967 100644
--- a/src/recode.jl
+++ b/src/recode.jl
@@ -52,27 +52,34 @@ A user defined type could override this method to define an appropriate test fun
 optimize_pair(pair::Pair) = pair
 optimize_pair(pair::Pair{<:AbstractArray}) = Set(pair.first) => pair.second
 
-function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T}
+function recode!(dest::AbstractArray, src::AbstractArray, default::Any, pairs::Pair...)
     if length(dest) != length(src)
         throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
     end
 
-    opt_pairs = map(optimize_pair, pairs)
+    opt_pairs = optimize_pair.(pairs)
 
+    _recode!(dest, src, default, opt_pairs)
+end
+
+function _recode!(dest::AbstractArray{T}, src::AbstractArray, default,
+                  pairs::NTuple{<:Any, Pair}) where {T}
+    recode_to = last.(pairs)
+    recode_from = first.(pairs)
+    
     @inbounds for i in eachindex(dest, src)
         x = src[i]
 
-        for j in 1:length(opt_pairs)
-            p = opt_pairs[j]
-            # we use isequal and recode_in because we cannot really distinguish scalars from collections
-            if x ≅ p.first || recode_in(x, p.first)
-                dest[i] = p.second
-                @goto nextitem
-            end
-        end
-
+        # @inline is needed for type stability and Compat for compatibility before julia v1.8
+        # we use isequal and recode_in because we cannot really
+        # distinguish scalars from collections
+        j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x,y), recode_from)
+        
+        # Value in one of the pairs
+        if j !== nothing
+            dest[i] = recode_to[j]
         # Value not in any of the pairs
-        if ismissing(x)
+        elseif ismissing(x)
             eltype(dest) >: Missing ||
                 throw(MissingException("missing value found, but dest does not support them: " *
                                        "recode them to a supported value"))
@@ -89,21 +96,16 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs
         else
             dest[i] = default
         end
-
-        @label nextitem
     end
 
     dest
 end
 
-function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T}
-    if length(dest) != length(src)
-        throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
-    end
-
-    opt_pairs = map(optimize_pair, pairs)
+function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, default::Any,
+                  pairs::NTuple{<:Any, Pair}) where {T, R}
+    recode_from = first.(pairs)
+    vals = T[p.second for p in pairs]
 
-    vals = T[p.second for p in opt_pairs]
     default !== nothing && push!(vals, default)
 
     levels!(dest.pool, filter!(!ismissing, unique(vals)))
@@ -112,22 +114,22 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa
     dupvals = length(vals) != length(levels(dest.pool))
 
     drefs = dest.refs
-    pairmap = [ismissing(v) ? 0 : get(dest.pool, v) for v in vals]
-    defaultref = default === nothing || ismissing(default) ? 0 : get(dest.pool, default)
+    pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals]
+    defaultref = default === nothing || ismissing(default) ? zero(R) : get(dest.pool, default)
+
     @inbounds for i in eachindex(drefs, src)
         x = src[i]
 
-        for j in 1:length(opt_pairs)
-            p = opt_pairs[j]
-            # we use isequal and recode_in because we cannot really distinguish scalars from collections
-            if x ≅ p.first || recode_in(x, p.first)
-                drefs[i] = dupvals ? pairmap[j] : j
-                @goto nextitem
-            end
-        end
+        # @inline is needed for type stability and Compat for compatibility before julia v1.8  
+        # we use isequal and recode_in because we cannot really
+        # distinguish scalars from collections
+        j = Compat.@inline findfirst(y -> isequal(x, y) || recode_in(x, y), recode_from)
 
+        # Value in one of the pairs
+        if j !== nothing
+            drefs[i] = dupvals ? pairmap[j] : j
         # Value not in any of the pairs
-        if ismissing(x)
+        elseif ismissing(x)
             eltype(dest) >: Missing ||
                 throw(MissingException("missing value found, but dest does not support them: " *
                                        "recode them to a supported value"))
@@ -144,8 +146,6 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa
         else
             drefs[i] = defaultref
         end
-
-        @label nextitem
     end
 
     # Put existing levels first, and sort them if possible
@@ -168,25 +168,21 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa
     dest
 end
 
-function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
-                 default::Any, pairs::Pair...) where {T, N, R<:Integer}
-    if length(dest) != length(src)
-        throw(DimensionMismatch("dest and src must be of the same length " *
-                                "(got $(length(dest)) and $(length(src)))"))
-    end
-
+function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
+                  default::Any, pairs::NTuple{<:Any, Pair}) where {T, N, R<:Integer}
+    recode_from = first.(pairs)
     vals = T[p.second for p in pairs]
+             
     if default === nothing
         srclevels = levels(src)
 
         # Remove recoded levels as they won't appear in result
-        firsts = (p.first for p in pairs)
         keptlevels = Vector{T}(undef, 0)
         sizehint!(keptlevels, length(srclevels))
 
         for l in srclevels
-            if !(any(x -> x ≅ l, firsts) ||
-                 any(f -> recode_in(l, f), firsts))
+            if !(any(x -> x ≅ l, recode_from) ||
+                 any(f -> recode_in(l, f), recode_from))
                 try
                     push!(keptlevels, l)
                 catch err

From 2d16eafeb8a8117c63a1ae590bb0a2c726dbaf53 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 22 Feb 2025 11:18:37 +0100
Subject: [PATCH 10/25] Bump actions/cache from 2 to 4 (#414)

Bumps [actions/cache](https://github.com/actions/cache) from 2 to 4.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v2...v4)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 06fddf55..1fb7fb41 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,7 +31,7 @@ jobs:
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v2
+      - uses: actions/cache@v4
         env:
           cache-name: cache-artifacts
         with:

From d0f708104e110b742298ef6df6cc828b17ae014e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 22 Feb 2025 11:19:37 +0100
Subject: [PATCH 11/25] Bump julia-actions/setup-julia from 1 to 2 (#413)

Bumps [julia-actions/setup-julia](https://github.com/julia-actions/setup-julia) from 1 to 2.
- [Release notes](https://github.com/julia-actions/setup-julia/releases)
- [Changelog](https://github.com/julia-actions/setup-julia/blob/master/devdocs/making_a_new_release.md)
- [Commits](https://github.com/julia-actions/setup-julia/compare/v1...v2)

---
updated-dependencies:
- dependency-name: julia-actions/setup-julia
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/CompatHelper.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 8d889a9d..e628f26d 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -15,7 +15,7 @@ jobs:
         run: which julia
         continue-on-error: true
       - name: Install Julia, but only if it is not already available in the PATH
-        uses: julia-actions/setup-julia@v1
+        uses: julia-actions/setup-julia@v2
         with:
           version: '1'
           arch: ${{ runner.arch }}

From f313cb09505840b70eae791e62912d3bb4885941 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Tue, 29 Apr 2025 11:53:30 +0200
Subject: [PATCH 12/25] Fix allocations by dropping `CategoricalPool` type
 parameter (#418)

Self-referential types generate allocations since Julia 1.11 (JuliaLang/julia#58169).
This third parameter seems to have been unnecessary since `NominalValue`
and `OrdinalValue` got merged into a single `CategoricalValue` type.
---
 src/array.jl            |  3 +--
 src/pool.jl             | 14 ++++++--------
 src/typedefs.jl         | 39 ++++++++++++++++-----------------------
 test/04_constructors.jl | 22 +++++-----------------
 4 files changed, 28 insertions(+), 50 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index ffbf66b8..6950d1fa 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -160,9 +160,8 @@ function CategoricalArray{T, N, R}(::UndefInitializer, dims::NTuple{N,Int};
     U = leveltype(nonmissingtype(T))
     S = T >: Missing ? Union{U, Missing} : U
     check_supported_eltype(S, T)
-    V = CategoricalValue{U, R}
     levs = levels === nothing ? U[] : collect(U, levels)
-    CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R, V}(levs, ordered))
+    CategoricalArray{S, N}(zeros(R, dims), CategoricalPool{U, R}(levs, ordered))
 end
 
 CategoricalArray{T, N}(::UndefInitializer, dims::NTuple{N,Int};
diff --git a/src/pool.jl b/src/pool.jl
index 0ece21ce..9753a76d 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -2,20 +2,18 @@ const catpool_seed = UInt === UInt32 ? 0xe3cf1386 : 0x356f2c715023f1a5
 
 hashlevels(levs::AbstractVector) = foldl((h, x) -> hash(x, h), levs, init=catpool_seed)
 
-CategoricalPool{T, R, V}(ordered::Bool=false) where {T, R, V} =
-    CategoricalPool{T, R, V}(T[], ordered)
 CategoricalPool{T, R}(ordered::Bool=false) where {T, R} =
     CategoricalPool{T, R}(T[], ordered)
 CategoricalPool{T}(ordered::Bool=false) where {T} =
     CategoricalPool{T, DefaultRefType}(T[], ordered)
 
 CategoricalPool{T, R}(levels::AbstractVector, ordered::Bool=false) where {T, R} =
-    CategoricalPool{T, R, CategoricalValue{T, R}}(convert(Vector{T}, levels), ordered)
+    CategoricalPool{T, R}(convert(Vector{T}, levels), ordered)
 CategoricalPool(levels::AbstractVector{T}, ordered::Bool=false) where {T} =
     CategoricalPool{T, DefaultRefType}(convert(Vector{T}, levels), ordered)
 
 CategoricalPool(invindex::Dict{T, R}, ordered::Bool=false) where {T, R <: Integer} =
-    CategoricalPool{T, R, CategoricalValue{T, R}}(invindex, ordered)
+    CategoricalPool{T, R}(invindex, ordered)
 
 Base.convert(::Type{T}, pool::T) where {T <: CategoricalPool} = pool
 
@@ -29,12 +27,12 @@ function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) wher
 
     levelsT = convert(Vector{T}, pool.levels)
     invindexT = convert(Dict{T, R}, pool.invindex)
-    return CategoricalPool{T, R, CategoricalValue{T, R}}(levelsT, invindexT, pool.ordered)
+    return CategoricalPool{T, R}(levelsT, invindexT, pool.ordered)
 end
 
-Base.copy(pool::CategoricalPool{T, R, V}) where {T, R, V} =
-    CategoricalPool{T, R, V}(copy(pool.levels), copy(pool.invindex),
-                             pool.ordered, pool.hash)
+Base.copy(pool::CategoricalPool{T, R}) where {T, R} =
+    CategoricalPool{T, R}(copy(pool.levels), copy(pool.invindex),
+                          pool.ordered, pool.hash)
 
 function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R}
     @static if VERSION >= v"1.6.0"
diff --git a/src/typedefs.jl b/src/typedefs.jl
index 973cbaf8..0f9aa414 100644
--- a/src/typedefs.jl
+++ b/src/typedefs.jl
@@ -6,8 +6,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number}
 # Type params:
 # * `T` type of categorized values
 # * `R` integer type for referencing category levels
-# * `V` categorical value type
-mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
+mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer}
     levels::Vector{T}          # category levels ordered by their reference codes
     invindex::Dict{T, R}       # map from category levels to their reference codes
     ordered::Bool              # whether levels can be compared using <
@@ -15,8 +14,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
     subsetof::Ptr{Nothing}     # last seen strict superset pool
     equalto::Ptr{Nothing}      # last seen equal pool
 
-    function CategoricalPool{T, R, V}(levels::Vector{T},
-                                      ordered::Bool) where {T, R, V}
+    function CategoricalPool{T, R}(levels::Vector{T},
+                                   ordered::Bool) where {T, R}
         if length(levels) > typemax(R)
             throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end]))
         end
@@ -24,10 +23,10 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
         if length(invindex) != length(levels)
             throw(ArgumentError("Duplicate entries are not allowed in levels"))
         end
-        CategoricalPool{T, R, V}(levels, invindex, ordered)
+        CategoricalPool{T, R}(levels, invindex, ordered)
     end
-    function CategoricalPool{T, R, V}(invindex::Dict{T, R},
-                                      ordered::Bool) where {T, R, V}
+    function CategoricalPool{T, R}(invindex::Dict{T, R},
+                                   ordered::Bool) where {T, R}
         levels = Vector{T}(undef, length(invindex))
         # If invindex contains non consecutive values, a BoundsError will be thrown
         try
@@ -40,18 +39,12 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer, V}
         if length(invindex) > typemax(R)
             throw(LevelsException{T, R}(levels[typemax(R)+1:end]))
         end
-        CategoricalPool{T, R, V}(levels, invindex, ordered)
+        CategoricalPool{T, R}(levels, invindex, ordered)
     end
-    function CategoricalPool{T, R, V}(levels::Vector{T},
-                                      invindex::Dict{T, R},
-                                      ordered::Bool,
-                                      hash::Union{UInt, Nothing}=nothing) where {T, R, V}
-        if !(V <: CategoricalValue)
-            throw(ArgumentError("Type $V is not a categorical value type"))
-        end
-        if V !== CategoricalValue{T, R}
-            throw(ArgumentError("V must be CategoricalValue{T, R}"))
-        end
+    function CategoricalPool{T, R}(levels::Vector{T},
+                                   invindex::Dict{T, R},
+                                   ordered::Bool,
+                                   hash::Union{UInt, Nothing}=nothing) where {T, R}
         pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL)
         return pool
     end
@@ -77,7 +70,7 @@ the order of the pool's [`levels`](@ref DataAPI.levels) is used rather than the
 ordering of values of type `T`.
 """
 struct CategoricalValue{T <: SupportedTypes, R <: Integer}
-    pool::CategoricalPool{T, R, CategoricalValue{T, R}}
+    pool::CategoricalPool{T, R}
     ref::R
 end
 
@@ -98,14 +91,14 @@ const AbstractCategoricalMatrix{T, R, V, C, U} = AbstractCategoricalArray{T, 2,
 
 mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractCategoricalArray{T, N, R, V, C, U}
     refs::Array{R, N}
-    pool::CategoricalPool{V, R, C}
+    pool::CategoricalPool{V, R}
 
     function CategoricalArray{T, N}(refs::Array{R, N},
-                                    pool::CategoricalPool{V, R, C}) where
-                                                 {T, N, R <: Integer, V, C}
+                                    pool::CategoricalPool{V, R}) where
+                                                 {T, N, R <: Integer, V}
         T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}"))
         U = T >: Missing ? Missing : Union{}
-        new{T, N, R, V, C, U}(refs, pool)
+        new{T, N, R, V, CategoricalValue{V, R}, U}(refs, pool)
     end
 end
 const CategoricalVector{T, R <: Integer, V, C, U} = CategoricalArray{T, 1, R, V, C, U}
diff --git a/test/04_constructors.jl b/test/04_constructors.jl
index 5b39f95e..2d4eb4b0 100644
--- a/test/04_constructors.jl
+++ b/test/04_constructors.jl
@@ -5,22 +5,10 @@ using CategoricalArrays: DefaultRefType
 
 @testset "Type parameter constraints" begin
     # cannot use categorical value as level type
-    @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}(
+    @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}(
             Dict{CategoricalValue{Int,UInt8}, UInt8}(), false)
-    @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8, CategoricalValue{CategoricalValue{Int,UInt8},UInt8}}(
+    @test_throws TypeError CategoricalPool{CategoricalValue{Int,UInt8}, UInt8}(
                 CategoricalValue{Int,UInt8}[], false)
-    # cannot use non-categorical value as categorical value type
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Int[], false)
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, Int}(Dict{Int, UInt8}(), false)
-    # level type of the pool and categorical value must match
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Int[], false)
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{String, UInt8}}(Dict{Int, UInt8}(), false)
-    # reference type of the pool and categorical value must match
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Int[], false)
-    @test_throws ArgumentError CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt16}}(Dict{Int, UInt8}(), false)
-    # correct types combination
-    @test CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}(Int[], false) isa CategoricalPool
-    @test CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}}(Dict{Int, UInt8}(), false) isa CategoricalPool
 end
 
 @testset "empty CategoricalPool{String}" begin
@@ -38,7 +26,7 @@ end
 @testset "empty CategoricalPool{Int}" begin
     pool = CategoricalPool{Int, UInt8}()
 
-    @test isa(pool, CategoricalPool{Int, UInt8, CategoricalValue{Int, UInt8}})
+    @test isa(pool, CategoricalPool{Int, UInt8})
 
     @test isa(pool.levels, Vector{Int})
     @test length(pool.levels) == 0
@@ -50,7 +38,7 @@ end
 @testset "CategoricalPool{String, DefaultRefType}(a b c)" begin
     pool = CategoricalPool(["a", "b", "c"])
 
-    @test isa(pool, CategoricalPool{String, UInt32, CategoricalValue{String, UInt32}})
+    @test isa(pool, CategoricalPool{String, UInt32})
 
     @test isa(pool.levels, Vector{String})
     @test pool.levels == ["a", "b", "c"]
@@ -156,7 +144,7 @@ end
 @testset "CategoricalPool{Float64, UInt8}()" begin
     pool = CategoricalPool{Float64, UInt8}([1.0, 2.0, 3.0])
 
-    @test isa(pool, CategoricalPool{Float64, UInt8, CategoricalValue{Float64, UInt8}})
+    @test isa(pool, CategoricalPool{Float64, UInt8})
     @test CategoricalValue(pool, 1) isa CategoricalValue{Float64, UInt8}
 end
 

From 8bfc64785bfbb6c6eba8aaac0506aab720b0d844 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov <alyst@users.noreply.github.com>
Date: Tue, 29 Apr 2025 03:14:28 -0700
Subject: [PATCH 13/25] fix unique() behaviour, add unique!() (#358)

Make `unique()` return a `CategoricalVector` instead of a plain `Vector`,
so that it conforms to the semantics of `Base.unique()`.

This is a breaking change that requires a new minor release.
---
 src/array.jl            | 47 +++++++++++++++++++++++------------------
 src/subarray.jl         |  7 ------
 test/11_array.jl        | 23 ++++++++++++++++++--
 test/12_missingarray.jl | 12 +++++++++++
 4 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 6950d1fa..04c9ea56 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -1,7 +1,7 @@
 ## Code for CategoricalArray
 
 import Base: Array, convert, collect, copy, getindex, setindex!, similar, size,
-             unique, vcat, in, summary, float, complex, copyto!
+             unique, unique!, vcat, in, summary, float, complex, copyto!
 
 # Used for keyword argument default value
 _isordered(x::AbstractCategoricalArray) = isordered(x)
@@ -867,31 +867,36 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
     return A
 end
 
-function _unique(::Type{S},
-                 refs::AbstractArray{T},
-                 pool::CategoricalPool) where {S, T<:Integer}
-    nlevels = length(levels(pool)) + 1
-    order = fill(0, nlevels) # 0 indicates not seen
-    # If we don't track missings, short-circuit even if none has been seen
-    count = S >: Missing ? 0 : 1
-    @inbounds for i in refs
-        if order[i + 1] == 0
-            count += 1
-            order[i + 1] = count
-            count == nlevels && break
+# return unique refs (each value is unique) in the order of appearance in `refs`
+# equivalent to fallback Base.unique() implementation,
+# but short-circuits once references to all levels are encountered
+function _uniquerefs(A::CatArrOrSub{T}) where T
+    arefs = refs(A)
+    res = similar(arefs, 0)
+    nlevels = length(levels(A))
+    maxunique = nlevels + (T >: Missing ? 1 : 0)
+    seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref)
+    @inbounds for ref in arefs
+        if !seen[ref + 1]
+            push!(res, ref)
+            seen[ref + 1] = true
+            (length(res) == maxunique) && break
         end
     end
-    S[i == 1 ? missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0]
+    return res
 end
 
-"""
-    unique(A::CategoricalArray)
+unique(A::CatArrOrSub{T}) where T =
+    CategoricalVector{T}(_uniquerefs(A), copy(pool(A)))
 
-Return levels which appear in `A` in their order of appearance.
-This function is significantly slower than [`levels`](@ref DataAPI.levels)
-since it needs to check whether levels are used or not.
-"""
-unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool)
+function unique!(A::CategoricalVector)
+    urefs = _uniquerefs(A)
+    if length(urefs) != length(A)
+        resize!(A.refs, length(urefs))
+        copyto!(A.refs, urefs)
+    end
+    return A
+end
 
 """
     droplevels!(A::CategoricalArray)
diff --git a/src/subarray.jl b/src/subarray.jl
index 3e5f3f39..d7bf72df 100644
--- a/src/subarray.jl
+++ b/src/subarray.jl
@@ -5,13 +5,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren
 levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} =
     levels!(parent(sa), newlevels)
 
-function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray}
-    A = parent(sa)
-    refs = view(A.refs, sa.indices...)
-    S = eltype(P) >: Missing ? Union{eltype(levels(A.pool)), Missing} : eltype(levels(A.pool))
-    _unique(S, refs, A.pool)
-end
-
 refs(A::SubArray{<:Any, <:Any, <:CategoricalArray}) =
     view(parent(A).refs, parentindices(A)...)
 
diff --git a/test/11_array.jl b/test/11_array.jl
index 1edd2fef..4ac27b1b 100644
--- a/test/11_array.jl
+++ b/test/11_array.jl
@@ -16,6 +16,7 @@ using CategoricalArrays: DefaultRefType, leveltype
     @test isordered(x) === ordered
     @test levels(x) == sort(unique(a))
     @test unique(x) == unique(a)
+    @test typeof(unique(x)) === typeof(x)
     @test size(x) === (3,)
     @test length(x) === 3
 
@@ -272,6 +273,7 @@ using CategoricalArrays: DefaultRefType, leveltype
         @test x == collect(a)
         @test isordered(x) === ordered
         @test levels(x) == unique(x) == unique(a)
+        @test typeof(unique(x)) === typeof(x)
         @test size(x) === (4,)
         @test length(x) === 4
         @test leveltype(x) === Float64
@@ -437,6 +439,7 @@ using CategoricalArrays: DefaultRefType, leveltype
         @test x[4] === CategoricalValue(x.pool, 4)
         @test levels(x) == unique(a)
         @test unique(x) == unique(collect(x))
+        @test typeof(unique(x)) === typeof(x)
 
         x[1:2] .= -1
         @test x[1] === CategoricalValue(x.pool, 5)
@@ -473,6 +476,7 @@ using CategoricalArrays: DefaultRefType, leveltype
         @test x == a
         @test isordered(x) === ordered
         @test levels(x) == unique(x) == unique(a)
+        @test unique(x) isa CategoricalVector{String, R}
         @test size(x) === (2, 3)
         @test length(x) === 6
 
@@ -729,6 +733,7 @@ end
     @test levels!(x, ["Young", "Middle", "Old"]) === x
     @test levels(x) == ["Young", "Middle", "Old"]
     @test unique(x) == ["Old", "Young", "Middle"]
+    @test typeof(unique(x)) === typeof(x)
     @test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x
     @test levels(x) == ["Young", "Middle", "Old", "Unused"]
     @test unique(x) == ["Old", "Young", "Middle"]
@@ -736,20 +741,34 @@ end
     @test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"]
     @test unique(x) == ["Old", "Young", "Middle"]
 
+    y = copy(x)
+    @test unique!(y) === y
+    @test y == unique(x)
+
     x = CategoricalArray(String[])
     @test isa(levels(x), Vector{String}) && isempty(levels(x))
-    @test isa(unique(x), Vector{String}) && isempty(unique(x))
+    @test isa(unique(x), typeof(x)) && isempty(unique(x))
     @test levels!(x, ["Young", "Middle", "Old"]) === x
     @test levels(x) == ["Young", "Middle", "Old"]
-    @test isa(unique(x), Vector{String}) && isempty(unique(x))
+    @test isa(unique(x), typeof(x)) && isempty(unique(x))
+
+    y = copy(x)
+    @test unique!(y) === y
+    @test y == unique(x)
 
     # To test short-circuiting
     x = CategoricalArray(repeat(1:10, inner=10))
     @test levels(x) == collect(1:10)
     @test unique(x) == collect(1:10)
+    @test unique(x) isa typeof(x)
     @test levels!(x, [19:-1:1; 20]) === x
     @test levels(x) == [19:-1:1; 20]
     @test unique(x) == collect(1:10)
+    @test unique(x) isa typeof(x)
+
+    y = copy(x)
+    @test unique!(y) === y
+    @test y == 1:10
 end
 
 end
diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl
index fea335c2..a2204e40 100644
--- a/test/12_missingarray.jl
+++ b/test/12_missingarray.jl
@@ -19,9 +19,14 @@ const ≅ = isequal
                 @test isordered(x) === ordered
                 @test levels(x) == sort(unique(a))
                 @test unique(x) == unique(a)
+                @test typeof(unique(x)) === typeof(x)
                 @test size(x) === (3,)
                 @test length(x) === 3
 
+                y = copy(x)
+                @test y === unique!(y)
+                @test y == unique(x)
+
                 @test convert(CategoricalArray, x) === x
                 @test convert(CategoricalArray{Union{String, Missing}}, x) === x
                 @test convert(CategoricalArray{Union{String, Missing}, 1}, x) === x
@@ -296,6 +301,7 @@ const ≅ = isequal
             @test x ≅ a
             @test levels(x) == filter(x->!ismissing(x), unique(a))
             @test unique(x) ≅ unique(a)
+            @test typeof(unique(x)) === typeof(x)
             @test size(x) === (3,)
             @test length(x) === 3
 
@@ -440,6 +446,7 @@ const ≅ = isequal
         @test x == collect(a)
         @test isordered(x) === ordered
         @test levels(x) == unique(x) == unique(a)
+        @test typeof(unique(x)) === typeof(x)
         @test size(x) === (4,)
         @test length(x) === 4
         @test leveltype(x) === Float64
@@ -616,6 +623,7 @@ const ≅ = isequal
         @test x[4] === CategoricalValue(x.pool, 4)
         @test levels(x) == unique(a)
         @test unique(x) == unique(collect(x))
+        @test typeof(unique(x)) === typeof(x)
 
         x[1:2] .= -1
         @test x[1] === CategoricalValue(x.pool, 5)
@@ -625,6 +633,7 @@ const ≅ = isequal
         @test isordered(x) === false
         @test levels(x) == vcat(unique(a), -1)
         @test unique(x) == unique(collect(x))
+        @test typeof(unique(x)) === typeof(x)
 
 
         ordered!(x, ordered)
@@ -656,6 +665,7 @@ const ≅ = isequal
         @test x == a
         @test isordered(x) === ordered
         @test levels(x) == unique(x) == unique(a)
+        @test unique(x) isa CategoricalVector{Union{String, Missing}, R}
         @test size(x) === (2, 3)
         @test length(x) === 6
 
@@ -816,6 +826,7 @@ const ≅ = isequal
         @test isordered(x) === ordered
         @test levels(x) == filter(x->!ismissing(x), unique(a))
         @test unique(x) ≅ unique(a)
+        @test unique(x) isa CategoricalVector{Union{String, Missing}, R}
         @test size(x) === (2, 3)
         @test length(x) === 6
 
@@ -1137,6 +1148,7 @@ end
     x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"])
     @test levels(x) == ["Middle", "Old", "Young"]
     @test unique(x) ≅ ["Old", "Young", "Middle", missing]
+    @test typeof(unique(x)) === typeof(x)
     @test levels!(x, ["Young", "Middle", "Old"]) === x
     @test levels(x) == ["Young", "Middle", "Old"]
     @test unique(x) ≅ ["Old", "Young", "Middle", missing]

From a7ccfd5ca6de9f1fa16a8b4f2fd272fbbc5f1418 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Thu, 1 May 2025 00:50:27 +0200
Subject: [PATCH 14/25] Support reading from and writing to Arrow files (#415)

This requires overriding `Arrow.DictEncoding` so that an `Arrow.DictEncoded`
with a `CategoricalArray` dictionary with one entry per level is created.
This is the only way to ensure that indexing the Arrow column gives
`CategoricalValue` objects. In practice such columns will most often
be used after conversion to `CategoricalArray` via `copy`, `DataFrame`, etc.

Apparently, pandas does not allow reading the resulting file if the array
allows for missing values, as it does not accept `missing` in the dictionary.
Instead, it would need missing entries to be coded via null indices, which
is less efficient.

Require Julia 1.6 as tests fail on older Julia versions.
---
 .github/workflows/ci.yml         |  2 +-
 Project.toml                     |  8 +++-
 ext/CategoricalArraysArrowExt.jl | 70 ++++++++++++++++++++++++++++++++
 src/CategoricalArrays.jl         |  1 +
 test/13_arraycommon.jl           | 53 ++++++++++++++++++++++++
 5 files changed, 131 insertions(+), 3 deletions(-)
 create mode 100644 ext/CategoricalArraysArrowExt.jl

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1fb7fb41..c59b0c53 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,7 +12,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
+          - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia
           - 'nightly'
         os:
diff --git a/Project.toml b/Project.toml
index 4593b00b..adbb8789 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,18 +13,21 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [weakdeps]
+Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
 StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 
 [extensions]
+CategoricalArraysArrowExt = "Arrow"
 CategoricalArraysJSONExt = "JSON"
 CategoricalArraysRecipesBaseExt = "RecipesBase"
 CategoricalArraysSentinelArraysExt = "SentinelArrays"
 CategoricalArraysStructTypesExt = "StructTypes"
 
 [compat]
+Arrow = "2"
 Compat = "3.37, 4"
 DataAPI = "1.6"
 JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
@@ -35,9 +38,10 @@ Requires = "1"
 SentinelArrays = "1"
 Statistics = "1"
 StructTypes = "1"
-julia = "1"
+julia = "1.6"
 
 [extras]
+Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
@@ -49,4 +53,4 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]
+test = ["Arrow", "Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]
diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl
new file mode 100644
index 00000000..3e764122
--- /dev/null
+++ b/ext/CategoricalArraysArrowExt.jl
@@ -0,0 +1,70 @@
+module CategoricalArraysArrowExt
+
+using CategoricalArrays
+import Arrow
+import Arrow: ArrowTypes
+
+const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray")
+ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME
+ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R)
+
+ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME
+ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} =
+    string(R)
+
+const REFTYPES = Dict(string(T) => T for T in (Int128, Int16, Int32, Int64, Int8, UInt128,
+                                               UInt16, UInt32, UInt64, UInt8))
+function ArrowTypes.JuliaType(::Val{CATARRAY_ARROWNAME},
+                              ::Type{S}, meta::String) where S
+    R = REFTYPES[meta]
+    return CategoricalValue{S, R}
+end
+
+for (MV, MT) in ((:V, :T), (:(Union{V,Missing}), :(Union{T,Missing})))
+    @eval begin
+        function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.List{U, O, B},
+                                             isOrdered, metadata) where
+            {T, R, V<:CategoricalValue{T,R}, S, O, A, B, U}
+            newdata = Arrow.List{$MT,O,B}(data.arrow, data.validity, data.offsets,
+                                          data.data, data.ℓ, data.metadata)
+            levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata
+            catdata = CategoricalVector{$MT,R}(newdata, levels=levels)
+            return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata,
+                                                             isOrdered, metadata)
+        end
+
+        function Arrow.DictEncoding{$MV,S,A}(id, data::Arrow.Primitive{U, B},
+                                            isOrdered, metadata) where
+            {T, R, V<:CategoricalValue{T,R}, S, A, B, U}
+            newdata = Arrow.Primitive{$MT,B}(data.arrow, data.validity, data.data,
+                                            data.ℓ, data.metadata)
+            levels = Missing <: $MT ? collect(skipmissing(newdata)) : newdata
+            catdata = CategoricalVector{$MT,R}(newdata, levels=levels)
+            return Arrow.DictEncoding{$MV,S,typeof(catdata)}(id, catdata,
+                                                             isOrdered, metadata)
+        end
+    end
+end
+
+function Base.copy(x::Arrow.DictEncoded{V}) where {T, R, V<:CategoricalValue{T, R}}
+    pool = CategoricalPool{T,R}(x.encoding.data)
+    inds = x.indices
+    refs = similar(inds, R)
+    refs .= inds .+ one(R)
+    return CategoricalVector{T}(refs, pool)
+end
+
+function Base.copy(x::Arrow.DictEncoded{Union{Missing,V}}) where
+    {T, R, V<:CategoricalValue{T, R}}
+    ismissing(x.encoding.data[1]) ||
+        throw(ErrorException("`missing` must be the first value in a " *
+                             "`CategoricalArray` pool"))
+    levels = collect(skipmissing(x.encoding.data))
+    pool = CategoricalPool{T,R}(levels)
+    inds = x.indices
+    refs = similar(inds, R)
+    refs .= inds
+    return CategoricalVector{Union{T,Missing}}(refs, pool)
+end
+
+end
diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl
index a28cba94..8f511677 100644
--- a/src/CategoricalArrays.jl
+++ b/src/CategoricalArrays.jl
@@ -41,6 +41,7 @@ module CategoricalArrays
 
     @static if !isdefined(Base, :get_extension)
         function __init__()
+            @require Arrow="69666777-d1a9-59fb-9406-91d4454c9d45" include("../ext/CategoricalArraysArrowExt.jl")
             @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl")
             @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl")
             @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl")
diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl
index 20d61ef0..4d7c5279 100644
--- a/test/13_arraycommon.jl
+++ b/test/13_arraycommon.jl
@@ -10,6 +10,8 @@ using StructTypes
 using RecipesBase
 using Plots
 using SentinelArrays
+using Arrow
+using Missings
 
 const ≅ = isequal
 const ≇ = !isequal
@@ -2071,6 +2073,57 @@ StructTypes.StructType(::Type{<:MyCustomType}) = StructTypes.Struct()
     @test levels(readx.var) == levels(x.var)
 end
 
+if Int == Int64
+    @testset "writing and reading Arrow files" for f in (identity, passmissing(string))
+        xref = f.([3, 1, 4, 1, 4])
+        x = categorical(f.([3, 1, 4, 1, 4]))
+        tbl = mktemp() do path, io
+            Arrow.write(path, (x=x,))
+            Arrow.Table(path)
+        end
+        @test tbl.x == x
+        @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt32}, Int8,
+                                          <: CategoricalVector{eltype(xref), UInt32}}
+        @test copy(tbl.x) == x
+        @test copy(x) isa CategoricalArray{eltype(xref),1,UInt32}
+
+        x = categorical(f.([3, 1, 4, 1, 4]), compress=true)
+        tbl = mktemp() do path, io
+            Arrow.write(path, (x=x,))
+            Arrow.Table(path)
+        end
+        @test tbl.x == x
+        @test tbl.x isa Arrow.DictEncoded{CategoricalValue{eltype(xref), UInt8}, Int8,
+                                          <: CategoricalVector{eltype(xref), UInt8}}
+        @test copy(tbl.x) == x
+        @test copy(x) isa CategoricalArray{eltype(xref),1,UInt8}
+
+        x = categorical(recode(xref, 1 => missing))
+        tbl = mktemp() do path, io
+            Arrow.write(path, (x=x,))
+            Arrow.Table(path)
+        end
+        @test tbl.x ≅ x
+        @test tbl.x isa Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing},
+                                          Int8,
+                                          <: CategoricalVector{Union{eltype(xref), Missing},
+                                                               UInt32}}
+        @test copy(tbl.x) ≅ x
+        @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32}
+
+        recode!(x, missing => f(1))
+        tbl = mktemp() do path, io
+            Arrow.write(path, (x=x,))
+            Arrow.Table(path)
+        end
+        @test tbl.x == x
+        @test tbl.x isa Arrow.DictEncoded{Union{CategoricalValue{eltype(xref), UInt32}, Missing}, Int8,
+                                          <: CategoricalVector{Union{eltype(xref), Missing}, UInt32}}
+        @test copy(tbl.x) == x
+        @test copy(x) isa CategoricalArray{Union{eltype(xref), Missing},1,UInt32}
+    end
+end
+
 @testset "refarray, refvalue, refpool, and invrefpool" begin
     for y in (categorical(["b", "a", "c", "b"]),
               view(categorical(["a", "a", "c", "b"]), 1:3),

From dc83e6af8e1e9f8063e63c7c347c247205c5e6b9 Mon Sep 17 00:00:00 2001
From: Andreas Noack <andreas@noack.dk>
Date: Sat, 17 May 2025 16:49:04 +0200
Subject: [PATCH 15/25] Test on min, lts, and pre instead of 1.6 and nightly

---
 .github/workflows/ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c59b0c53..00903e16 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,9 +12,10 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - 'min'
+          - 'lts'
           - '1' # automatically expands to the latest stable 1.x release of Julia
-          - 'nightly'
+          - 'pre'
         os:
           - ubuntu-latest
           - macOS-latest

From adbd741fa9c43344ac2154e95d2a0813bd093cd2 Mon Sep 17 00:00:00 2001
From: Andreas Noack <andreas@noack.dk>
Date: Sat, 17 May 2025 16:49:33 +0200
Subject: [PATCH 16/25] Only test on Linux

The package doesn't have any binary dependencies so there is
not much value in testing on all platforms except sometimes
detecting issues in test dependencies.
---
 .github/workflows/ci.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 00903e16..f7778286 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,8 +18,6 @@ jobs:
           - 'pre'
         os:
           - ubuntu-latest
-          - macOS-latest
-          - windows-latest
         arch:
           - x64
           - x86

From 5d0f595289f9641c74ce3376973a55fcee5e82be Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Sun, 18 May 2025 11:22:48 +0200
Subject: [PATCH 17/25] Use RecipesPipeline instead of Plots

---
 Project.toml           | 4 ++--
 test/13_arraycommon.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index adbb8789..4de04411 100644
--- a/Project.toml
+++ b/Project.toml
@@ -45,12 +45,12 @@ Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
+RecipesPipeline = "01d81517-befc-4cb6-b9ec-a95719d0359c"
 SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
 StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Arrow", "Dates", "JSON", "JSON3", "Plots", "PooledArrays", "RecipesBase", "SentinelArrays", "StructTypes", "Test"]
+test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StructTypes", "Test"]
diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl
index 4d7c5279..2fb369c9 100644
--- a/test/13_arraycommon.jl
+++ b/test/13_arraycommon.jl
@@ -8,7 +8,7 @@ using PooledArrays
 using JSON3
 using StructTypes
 using RecipesBase
-using Plots
+using RecipesPipeline
 using SentinelArrays
 using Arrow
 using Missings

From e91470442d483f1ad1a2d2b988ec07341804b16e Mon Sep 17 00:00:00 2001
From: Andreas Noack <andreas@noack.dk>
Date: Sun, 18 May 2025 20:05:07 +0200
Subject: [PATCH 18/25] Continue with '1.6' instead of 'min' for now because of
 a method ambiguity in Arrow on 1.6.0.

---
 .github/workflows/ci.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f7778286..aee70898 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -12,7 +12,10 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - 'min'
+          # FIXME! Switch from 1.6 to 'min' once we require a higher minimum
+          # We can't switch yet as there is a method ambiguity for a dependency
+          # in version 1.6.0.
+          - '1.6'
           - 'lts'
           - '1' # automatically expands to the latest stable 1.x release of Julia
           - 'pre'

From 07b955f6c47d98d132df707acc8662c1729549db Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Mon, 19 May 2025 09:44:12 +0200
Subject: [PATCH 19/25] Remove deprecations (#419)

In preparation of 1.0. These have been in place for years,
it's unlikely people rely on them anyway.
---
 src/CategoricalArrays.jl |  2 --
 src/array.jl             |  8 +-------
 src/deprecated.jl        | 18 ------------------
 src/extras.jl            | 12 ------------
 test/05_convert.jl       |  6 +++---
 test/11_array.jl         |  2 +-
 test/13_arraycommon.jl   |  2 +-
 test/17_deprecated.jl    | 16 ----------------
 test/runtests.jl         |  3 +--
 9 files changed, 7 insertions(+), 62 deletions(-)
 delete mode 100644 src/deprecated.jl
 delete mode 100644 test/17_deprecated.jl

diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl
index 8f511677..e597c344 100644
--- a/src/CategoricalArrays.jl
+++ b/src/CategoricalArrays.jl
@@ -33,8 +33,6 @@ module CategoricalArrays
     include("extras.jl")
     include("recode.jl")
 
-    include("deprecated.jl")
-
     if !isdefined(Base, :get_extension)
         using Requires: @require
     end
diff --git a/src/array.jl b/src/array.jl
index 04c9ea56..c462e7d4 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -790,13 +790,7 @@ entries corresponding to omitted levels will be set to `missing`.
 Else, `newlevels` must include all levels which appear in the data.
 """
 function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
-                 allowmissing::Bool=false,
-                 allow_missing::Union{Bool, Nothing}=nothing) where {T, N, R}
-    if allow_missing !== nothing
-        Base.depwarn("allow_missing argument is deprecated, use allowmissing instead",
-                     :levels!)
-        allowmissing = allow_missing
-    end
+                 allowmissing::Bool=false) where {T, N, R}
     (levels(A) == newlevels) && return A # nothing to do
 
     # map each new level to its ref code
diff --git a/src/deprecated.jl b/src/deprecated.jl
deleted file mode 100644
index 667b2923..00000000
--- a/src/deprecated.jl
+++ /dev/null
@@ -1,18 +0,0 @@
-function index(pool::CategoricalPool)
-    throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " *
-                         "use levels(pool) instead"))
-end
-function order(pool::CategoricalPool)
-    throw(ErrorException("CategoricalArrays.index(pool::CategoricalPool) is deprecated: " *
-                         "use 1:length(levels(pool)) instead"))
-end
-
-function categorical(A::AbstractArray, compress::Bool; kwargs...)
-    throw(ErrorException("categorical(A::AbstractArray, compress, kwargs...) is deprecated: " *
-                         "use categorical(A, compress=compress, kwargs...) instead."))
-end
-
-import Base: get
-
-@deprecate get(x::CategoricalValue) DataAPI.unwrap(x)
-@deprecate CategoricalValue(i::Integer, pool::CategoricalPool) pool[i]
\ No newline at end of file
diff --git a/src/extras.jl b/src/extras.jl
index 2afcef38..b806a9f2 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -114,19 +114,7 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
 @inline function cut(x::AbstractArray, breaks::AbstractVector;
                      extend::Union{Bool, Missing}=false,
                      labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter,
-                     allowmissing::Union{Bool, Nothing}=nothing,
-                     allow_missing::Union{Bool, Nothing}=nothing,
                      allowempty::Bool=false)
-    if allow_missing !== nothing
-        Base.depwarn("allow_missing argument is deprecated, use extend=missing instead",
-                     :cut)
-        extend = missing
-    end
-    if allowmissing !== nothing
-        Base.depwarn("allowmissing argument is deprecated, use extend=missing instead",
-                     :cut)
-         extend = missing
-    end
     return _cut(x, breaks, extend, labels, allowempty)
 end
 
diff --git a/test/05_convert.jl b/test/05_convert.jl
index b9b93544..3e7c98be 100644
--- a/test/05_convert.jl
+++ b/test/05_convert.jl
@@ -55,9 +55,9 @@ using CategoricalArrays: DefaultRefType, refcode, reftype, leveltype
         @test convert(Union{T, U}, v3)::T == v3
     end
 
-    @test unwrap(v1) === get(v1) === 1
-    @test unwrap(v2) === get(v2) === 2
-    @test unwrap(v3) === get(v3) === 3
+    @test unwrap(v1) === 1
+    @test unwrap(v2) === 2
+    @test unwrap(v3) === 3
 
     @test promote(1, v1) === (1, 1)
     @test promote(1.0, v1) === (1.0, 1.0)
diff --git a/test/11_array.jl b/test/11_array.jl
index 4ac27b1b..b474cfe1 100644
--- a/test/11_array.jl
+++ b/test/11_array.jl
@@ -719,7 +719,7 @@ using CategoricalArrays: DefaultRefType, leveltype
         @test levels(x) == ["c", "a", "b"]
 
         ordered!(x, ordered)
-        v = CategoricalValue(2, CategoricalPool(["xyz", "b"]))
+        v = CategoricalValue(CategoricalPool(["xyz", "b"]), 2)
         x[1] = v
         @test x[1] === CategoricalValue(x.pool, 4)
         @test x[2] === CategoricalValue(x.pool, 1)
diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl
index 2fb369c9..02b51bd7 100644
--- a/test/13_arraycommon.jl
+++ b/test/13_arraycommon.jl
@@ -893,7 +893,7 @@ end
             @test sort(cv, rev=rev, by=byf1) ≅ sort(cv, rev=rev, by=byf1)
 
             # Check that by function is not called on unused levels/missing
-            byf2 = x -> (@assert get(x) != "b"; x)
+            byf2 = x -> (@assert x != "b"; x)
             replace!(cv, missing=>"a", "b"=>"a")
             @test sort(cv, rev=rev, by=byf2) ≅ sort(cv, rev=rev, by=byf2)
         end
diff --git a/test/17_deprecated.jl b/test/17_deprecated.jl
deleted file mode 100644
index d5a08ff4..00000000
--- a/test/17_deprecated.jl
+++ /dev/null
@@ -1,16 +0,0 @@
-module TestExtras
-using Test
-using CategoricalArrays
-
-const ≅ = isequal
-
-@testset "allow_missing argument" begin
-    x = categorical(["a", "b", missing])
-    levels!(x, ["a"], allow_missing=true)
-    @test x ≅ ["a", missing, missing]
-
-    x = cut([1, missing, 100], [1, 2], allow_missing=true)
-    @test x ≅ ["[1, 2]", missing, missing]
-end
-
-end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index e59180e7..088cfc9e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -27,8 +27,7 @@ module TestCategoricalArrays
         "13_arraycommon.jl",
         "14_view.jl",
         "15_extras.jl",
-        "16_recode.jl",
-        "17_deprecated.jl"
+        "16_recode.jl"
     ]
 
     @testset "$test" for test in tests

From b16588b9d392ca612b4c4b140acc5f54f7d4e479 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Wed, 21 May 2025 18:23:39 +0200
Subject: [PATCH 20/25] Choose different quantile cutpoints in `cut(x, n)`
 (#416)

`Statistics.quantile` returns values which are not the most appropriate
to generate labels. It is more intuitive to choose values from the actual data,
which are likely to have fewer decimals and make more sense for users.
Since intervals are closed on the left, we just have to use the first data value
at or above the quantile. This doesn't change group assignments (only labels).
---
 src/extras.jl     | 55 ++++++++++++++++++++++++++++++++++-------------
 test/15_extras.jl | 50 ++++++++++++++++++++++++++++--------------
 2 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/src/extras.jl b/src/extras.jl
index b806a9f2..3f27aba6 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -42,8 +42,8 @@ default_formatter(from, to, i; leftclosed, rightclosed) =
 
 Cut a numeric array into intervals at values `breaks`
 and return an ordered `CategoricalArray` indicating
-the interval into which each entry falls. Intervals are of the form `[lower, upper)`,
-i.e. the lower bound is included and the upper bound is excluded, except
+the interval into which each entry falls. Intervals are of the form `[lower, upper)`
+(closed on the left), i.e. the lower bound is included and the upper bound is excluded, except
 the last interval, which is closed on both ends, i.e. `[lower, upper]`.
 
 If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
@@ -81,7 +81,7 @@ julia> cut(-1:0.5:1, 2)
  "Q1: [-1.0, 0.0)"
  "Q2: [0.0, 1.0]"
  "Q2: [0.0, 1.0]"
- "Q2: [0.0, 1.0]" 
+ "Q2: [0.0, 1.0]"
 
 julia> cut(-1:0.5:1, 2, labels=["A", "B"])
 5-element CategoricalArray{String,1,UInt32}:
@@ -89,7 +89,7 @@ julia> cut(-1:0.5:1, 2, labels=["A", "B"])
  "A"
  "B"
  "B"
- "B" 
+ "B"
 
 julia> cut(-1:0.5:1, 2, labels=[-0.5, +0.5])
 5-element CategoricalArray{Float64,1,UInt32}:
@@ -104,11 +104,11 @@ fmt (generic function with 1 method)
 
 julia> cut(-1:0.5:1, 3, labels=fmt)
 5-element CategoricalArray{String,1,UInt32}:
- "grp 1 (-1.0//-0.3333333333333335)"
- "grp 1 (-1.0//-0.3333333333333335)"
- "grp 2 (-0.3333333333333335//0.33333333333333326)"
- "grp 3 (0.33333333333333326//1.0)"
- "grp 3 (0.33333333333333326//1.0)"
+ "grp 1 (-1.0//0.0)"
+ "grp 1 (-1.0//0.0)"
+ "grp 2 (0.0//0.5)"
+ "grp 3 (0.5//1.0)"
+ "grp 3 (0.5//1.0)"
 ```
 """
 @inline function cut(x::AbstractArray, breaks::AbstractVector;
@@ -221,12 +221,38 @@ Provide the default label format for the `cut(x, ngroups)` method.
 quantile_formatter(from, to, i; leftclosed, rightclosed) =
     string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
 
+"""
+Find first value in (sorted) `v` which is greater than or equal to each quantile
+in (sorted) `qs`.
+"""
+function find_breaks(v::AbstractVector, qs::AbstractVector)
+    n = length(qs)
+    breaks = similar(v, n)
+    n == 0 && return breaks
+
+    i = 1
+    q = qs[1]
+    @inbounds for x in v
+        # Use isless and isequal to differentiate -0.0 from 0.0
+        if isless(q, x) || isequal(q, x)
+            breaks[i] = x
+            i += 1
+            i > n && break
+            q = qs[i]
+        end
+    end
+    return breaks
+end
+
 """
     cut(x::AbstractArray, ngroups::Integer;
         labels::Union{AbstractVector{<:AbstractString},Function},
         allowempty::Bool=false)
 
-Cut a numeric array into `ngroups` quantiles, determined using `quantile`.
+Cut a numeric array into `ngroups` quantiles.
+
+This is equivalent to `cut(x, quantile(x, (0:ngroups)/ngroups))`,
+but breaks are taken from actual data values instead of estimated quantiles.
 
 If `x` contains `missing` values, they are automatically skipped when computing
 quantiles.
@@ -246,15 +272,14 @@ function cut(x::AbstractArray, ngroups::Integer;
              labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter,
              allowempty::Bool=false)
     ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
-    xnm = eltype(x) >: Missing ? skipmissing(x) : x
-    # Computing extrema is faster than taking 0 and 1 quantiles
-    min_x, max_x = extrema(xnm)
+    sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
+    min_x, max_x = first(sorted_x), last(sorted_x)
     if (min_x isa Number && isnan(min_x)) ||
         (max_x isa Number && isnan(max_x))
         throw(ArgumentError("NaN values are not allowed in input vector"))
     end
-    breaks = quantile(xnm, (1:ngroups-1)/ngroups)
-    breaks = [min_x; breaks; max_x]
+    qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true)
+    breaks = [min_x; find_breaks(sorted_x, qs); max_x]
     if !allowempty && !allunique(@view breaks[1:end-1])
         throw(ArgumentError("cannot compute $ngroups quantiles due to " *
                             "too many duplicated values in `x`. " *
diff --git a/test/15_extras.jl b/test/15_extras.jl
index 1aaf8dc7..af4f79f5 100644
--- a/test/15_extras.jl
+++ b/test/15_extras.jl
@@ -127,18 +127,18 @@ end
 
 @testset "cut([5, 4, 3, 2], 2)" begin
     x = @inferred cut([5, 4, 3, 2], 2)
-    @test x == ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", "Q1: [2.0, 3.5)"]
+    @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"]
     @test isa(x, CategoricalArray)
     @test isordered(x)
-    @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"]
+    @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
 end
 
 @testset "cut(x, n) with missing values" begin
     x = @inferred cut([5, 4, 3, missing, 2], 2)
-    @test x ≅ ["Q2: [3.5, 5.0]", "Q2: [3.5, 5.0]", "Q1: [2.0, 3.5)", missing, "Q1: [2.0, 3.5)"]
+    @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"]
     @test isa(x, CategoricalArray)
     @test isordered(x)
-    @test levels(x) == ["Q1: [2.0, 3.5)", "Q2: [3.5, 5.0]"]
+    @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
 end
 
 @testset "cut(x, n) with invalid n" begin
@@ -255,20 +255,29 @@ end
     @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt)
 
     @test_throws ArgumentError cut([fill(1, 10); 4], 2)
-    @test_throws ArgumentError cut([fill(1, 10); 4], 3)
     x = cut([fill(1, 10); 4], 2, allowempty=true)
-    @test unique(x) == ["Q2: [1.0, 4.0]"]
+    @test unique(x) == ["Q2: [1, 4]"]
+    @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"]
+    @test_throws ArgumentError cut([fill(1, 10); 4], 3)
     x = cut([fill(1, 10); 4], 3, allowempty=true)
-    @test unique(x) == ["Q3: [1.0, 4.0]"]
-    @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: (1.0, 1.0)", "Q3: [1.0, 4.0]"]
+    @test unique(x) == ["Q3: [1, 4]"]
+    @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"]
+
+    x = cut([fill(4, 10); 1], 2)
+    @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"]
+    @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"]
+    @test_throws ArgumentError cut([fill(4, 10); 1], 3)
+    x = cut([fill(4, 10); 1], 3, allowempty=true)
+    @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"]
+    @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"]
 
     x = cut([fill(1, 5); fill(4, 5)], 2)
-    @test x == [fill("Q1: [1.0, 2.5)", 5); fill("Q2: [2.5, 4.0]", 5)]
-    @test levels(x) == ["Q1: [1.0, 2.5)", "Q2: [2.5, 4.0]"]
+    @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)]
+    @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"]
     @test_throws ArgumentError  cut([fill(1, 5); fill(4, 5)], 3)
     x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true)
-    @test x == [fill("Q2: [1.0, 4.0)", 5); fill("Q3: [4.0, 4.0]", 5)]
-    @test levels(x) == ["Q1: (1.0, 1.0)", "Q2: [1.0, 4.0)", "Q3: [4.0, 4.0]"]
+    @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)]
+    @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"]
 end
 
 @testset "cut with -0.0" begin
@@ -353,12 +362,21 @@ end
     @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"]
 
     x = cut([1:5; Inf], 2)
-    @test x ≅ [fill("Q1: [1.0, 3.5)", 3); fill("Q2: [3.5, Inf]", 3)]
-    @test levels(x) == ["Q1: [1.0, 3.5)", "Q2: [3.5, Inf]"]
+    @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)]
+    @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"]
 
     x = cut([1:5; -Inf], 2)
-    @test x ≅ [fill("Q1: [-Inf, 2.5)", 2); fill("Q2: [2.5, 5.0]", 3); "Q1: [-Inf, 2.5)"]
-    @test levels(x) == ["Q1: [-Inf, 2.5)", "Q2: [2.5, 5.0]"]
+    @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"]
+    @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"]
+end
+
+@testset "cut when quantile falls exactly on a data value" begin
+    x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3)
+    @test x ==
+        ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)",
+        "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)",
+        "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"]
+    @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"]
 end
 
 end
\ No newline at end of file

From e4a13b149327743bd18b52423f4747ceae65f970 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Wed, 21 May 2025 18:59:20 +0200
Subject: [PATCH 21/25] Simplify default `cut` labels (#422)

1) The quantile number isn't needed in most cases in the label,
and anyway it's shown when printing an ordered `CategoricalValue`.
Only use it by default when `allowempty=true` to avoid data-dependent
errors if there are duplicate levels.

2) Round breaks by default to a number of significant digits chosen by
`sigdigits`. This number is increased if necessary for breaks to remain unique.
This generates labels which are not completely correct as rounding may make
the left break greater than a value which is included in the interval,
but this is generally minor and expected. Taking the floor rather than
rounding would be more correct, but it can generate unexpected labels
due to floating point trickiness (e.g. `floor(0.0003, sigdigits=4)`
gives 0.0002999). This is what R does.

Add a deprecation to avoid breaking custom `labels` functions which did
not accept `sigdigits`.
---
 Project.toml             |   2 +-
 src/CategoricalArrays.jl |   4 +-
 src/extras.jl            | 182 +++++++++++++++++++++++++++++++--------
 test/15_extras.jl        | 148 ++++++++++++++++++++-----------
 4 files changed, 246 insertions(+), 90 deletions(-)

diff --git a/Project.toml b/Project.toml
index 4de04411..2c345ff7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ CategoricalArraysStructTypesExt = "StructTypes"
 
 [compat]
 Arrow = "2"
-Compat = "3.37, 4"
+Compat = "3.47, 4.10"
 DataAPI = "1.6"
 JSON = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21"
 JSON3 = "1.1.2"
diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl
index e597c344..f3383645 100644
--- a/src/CategoricalArrays.jl
+++ b/src/CategoricalArrays.jl
@@ -11,10 +11,12 @@ module CategoricalArrays
     import DataAPI: unwrap
     export unwrap
 
+    using Compat
+    @compat public default_formatter, numbered_formatter
+
     using DataAPI
     using Missings
     using Printf
-    import Compat
 
     # JuliaLang/julia#36810
     if VERSION < v"1.5.2"
diff --git a/src/extras.jl b/src/extras.jl
index 3f27aba6..60f32a64 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -27,17 +27,67 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
     end
 end
 
+if VERSION >= v"1.10"
+    const CUT_FMT = Printf.Format("%.*g")
+end
+
 """
-    default_formatter(from, to, i; leftclosed, rightclosed)
+    CategoricalArrays.default_formatter(from, to, i::Integer;
+                                        leftclosed::Bool, rightclosed::Bool,
+                                        sigdigits::Integer)
 
-Provide the default label format for the `cut(x, breaks)` method.
+Provide the default label format for the `cut(x, breaks)` method,
+which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise.
+
+If they are floating point values, `from` and `to` are turned into strings
+using `@sprintf("%.*g", sigdigits, value)`, i.e. rounded to `sigdigits`
+significant digits; values of other types are converted with `string`.
 """
-default_formatter(from, to, i; leftclosed, rightclosed) =
-    string(leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
+function default_formatter(from, to, i::Integer;
+                           leftclosed::Bool, rightclosed::Bool,
+                           sigdigits::Integer)
+    @static if VERSION >= v"1.10"
+        from_str = from isa AbstractFloat ?
+            Printf.format(CUT_FMT, sigdigits, from) :
+            string(from)
+        to_str = to isa AbstractFloat ?
+            Printf.format(CUT_FMT, sigdigits, to) :
+            string(to)
+    else
+        from_str = from isa AbstractFloat ?
+            Printf.format(Printf.Format("%.$(sigdigits)g"), from) :
+            string(from)
+        to_str = to isa AbstractFloat ?
+            Printf.format(Printf.Format("%.$(sigdigits)g"), to) :
+            string(to)
+    end
+    string(leftclosed ? "[" : "(", from_str, ", ", to_str, rightclosed ? "]" : ")")
+end
+
+"""
+    CategoricalArrays.numbered_formatter(from, to, i::Integer;
+                                         leftclosed::Bool, rightclosed::Bool,
+                                         sigdigits::Integer)
+
+Provide the default label format for the `cut(x, ngroups)` method
+when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed`
+is `true` and `"i: (from, to)"` otherwise.
+
+If they are floating point values, `from` and `to` are turned into strings
+using `@sprintf("%.*g", sigdigits, value)`, i.e. rounded to `sigdigits`
+significant digits; values of other types are converted with `string`.
+"""
+numbered_formatter(from, to, i::Integer;
+                   leftclosed::Bool, rightclosed::Bool,
+                   sigdigits::Integer) =
+    string(i, ": ",
+           default_formatter(from, to, i, leftclosed=leftclosed, rightclosed=rightclosed,
+                             sigdigits=sigdigits))
 
 @doc raw"""
     cut(x::AbstractArray, breaks::AbstractVector;
         labels::Union{AbstractVector,Function},
+        sigdigits::Integer=3,
         extend::Union{Bool,Missing}=false, allowempty::Bool=false)
 
 Cut a numeric array into intervals at values `breaks`
@@ -49,15 +99,25 @@ the last interval, which is closed on both ends, i.e. `[lower, upper]`.
 If `x` accepts missing values (i.e. `eltype(x) >: Missing`) the returned array will
 also accept them.
 
+!!! note
+    For floating point data, breaks may be rounded to `sigdigits` significant digits
+    when generating interval labels, meaning that they may not reflect exactly the cutpoints
+    used.
+
 # Keyword arguments
 * `extend::Union{Bool, Missing}=false`: when `false`, an error is raised if some values
   in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
   all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
 * `labels::Union{AbstractVector, Function}`: a vector of strings, characters
-  or numbers giving the names to use for
-  the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
+  or numbers giving the names to use for the intervals; or a function
+  `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
   the labels from the left and right interval boundaries and the group index. Defaults to
-  `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
+  [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
+  for the rightmost interval if `extend == true`).
+* `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
+  This value is increased automatically if necessary so that rounded breaks are unique.
+  Only used for floating point types and when `labels` is a function, in which case it
+  is passed to it as a keyword argument.
 * `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
   the last one appear multiple times, generating empty intervals; when `true`,
   duplicate breaks are allowed and the intervals they generate are kept as
@@ -69,19 +129,19 @@ julia> using CategoricalArrays
 
 julia> cut(-1:0.5:1, [0, 1], extend=true)
 5-element CategoricalArray{String,1,UInt32}:
- "[-1.0, 0.0)"
- "[-1.0, 0.0)"
- "[0.0, 1.0]"
- "[0.0, 1.0]"
- "[0.0, 1.0]" 
+ "[-1, 0)"
+ "[-1, 0)"
+ "[0, 1]"
+ "[0, 1]"
+ "[0, 1]" 
 
 julia> cut(-1:0.5:1, 2)
 5-element CategoricalArray{String,1,UInt32}:
- "Q1: [-1.0, 0.0)"
- "Q1: [-1.0, 0.0)"
- "Q2: [0.0, 1.0]"
- "Q2: [0.0, 1.0]"
- "Q2: [0.0, 1.0]"
+ "[-1, 0)"
+ "[-1, 0)"
+ "[0, 1]"
+ "[0, 1]"
+ "[0, 1]"
 
 julia> cut(-1:0.5:1, 2, labels=["A", "B"])
 5-element CategoricalArray{String,1,UInt32}:
@@ -114,15 +174,17 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
 @inline function cut(x::AbstractArray, breaks::AbstractVector;
                      extend::Union{Bool, Missing}=false,
                      labels::Union{AbstractVector{<:SupportedTypes},Function}=default_formatter,
+                     sigdigits::Integer=3,
                      allowempty::Bool=false)
-    return _cut(x, breaks, extend, labels, allowempty)
+    return _cut(x, breaks, extend, labels, sigdigits, allowempty)
 end
 
 # Separate function for inferability (thanks to inlining of cut)
 function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
               extend::Union{Bool, Missing},
               labels::Union{AbstractVector{<:SupportedTypes},Function},
-              allowempty::Bool=false) where {T, N}
+              sigdigits::Integer,
+              allowempty::Bool) where {T, N}
     if !issorted(breaks)
         breaks = sort(breaks)
     end
@@ -179,21 +241,60 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
         end
     end
 
+    # Find minimal number of digits so that distinct breaks remain so
+    if eltype(breaks) <: AbstractFloat
+        while true
+            collision = false # set when `sigdigits` had to grow: rescan all breaks
+            for i in 2:lastindex(breaks)
+                b1 = breaks[i-1]
+                b2 = breaks[i]
+                isequal(b1, b2) && continue
+                @static if VERSION >= v"1.10" # `CUT_FMT` is only defined on Julia >= 1.10
+                    b1_str = Printf.format(CUT_FMT, sigdigits, b1)
+                    b2_str = Printf.format(CUT_FMT, sigdigits, b2)
+                else
+                    b1_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b1)
+                    b2_str = Printf.format(Printf.Format("%.$(sigdigits)g"), b2)
+                end
+                if b1_str == b2_str
+                    sigdigits += 1
+                    collision = true
+                    break
+                end
+            end
+            collision || break
+        end
+    end
     n = length(breaks)
     n >= 2 || throw(ArgumentError("at least two breaks must be provided when extend is not true"))
     if labels isa Function
         from = breaks[1:n-1]
         to = breaks[2:n]
-        firstlevel = labels(from[1], to[1], 1,
-                            leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
+        local firstlevel
+        try
+            firstlevel = labels(from[1], to[1], 1,
+                                leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false,
+                                sigdigits=sigdigits)
+        catch
+            # Support functions defined before v1.0, where sigdigits did not exist
+            Base.depwarn("`labels` function is now required to accept a `sigdigits` keyword argument",
+                         :cut)
+            labels_orig = labels
+            labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
+                labels_orig(from, to, i; leftclosed, rightclosed)
+            firstlevel = labels_orig(from[1], to[1], 1,
+                                     leftclosed=!isequal(breaks[1], breaks[2]), rightclosed=false)
+        end
         levs = Vector{typeof(firstlevel)}(undef, n-1)
         levs[1] = firstlevel
         for i in 2:n-2
             levs[i] = labels(from[i], to[i], i,
-                             leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false)
+                             leftclosed=!isequal(breaks[i], breaks[i+1]), rightclosed=false,
+                             sigdigits=sigdigits)
         end
         levs[end] = labels(from[end], to[end], n-1,
-                           leftclosed=true, rightclosed=true)
+                           leftclosed=true, rightclosed=true,
+                           sigdigits=sigdigits)
     else
         length(labels) == n-1 ||
             throw(ArgumentError("labels must be of length $(n-1), but got length $(length(labels))"))
@@ -213,14 +314,6 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
     CategoricalArray{S, N}(refs, pool)
 end
 
-"""
-    quantile_formatter(from, to, i; leftclosed, rightclosed)
-
-Provide the default label format for the `cut(x, ngroups)` method.
-"""
-quantile_formatter(from, to, i; leftclosed, rightclosed) =
-    string("Q", i, ": ", leftclosed ? "[" : "(", from, ", ", to, rightclosed ? "]" : ")")
-
 """
 Find first value in (sorted) `v` which is greater than or equal to each quantile
 in (sorted) `qs`.
@@ -247,6 +340,7 @@ end
 """
     cut(x::AbstractArray, ngroups::Integer;
         labels::Union{AbstractVector{<:AbstractString},Function},
+        sigdigits::Integer=3,
         allowempty::Bool=false)
 
 Cut a numeric array into `ngroups` quantiles.
@@ -257,19 +351,32 @@ but breaks are taken from actual data values instead of estimated quantiles.
 If `x` contains `missing` values, they are automatically skipped when computing
 quantiles.
 
+!!! note
+    For floating point data, breaks may be rounded to `sigdigits` significant digits
+    when generating interval labels, meaning that they may not reflect exactly the cutpoints
+    used.
+
 # Keyword arguments
 * `labels::Union{AbstractVector, Function}`: a vector of strings, characters
-  or numbers giving the names to use for
-  the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
+  or numbers giving the names to use for the intervals; or a function
+  `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
   the labels from the left and right interval boundaries and the group index. Defaults to
-  `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
+  [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
+  for the rightmost interval) if `allowempty=false`, otherwise to
+  [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile
+  number to ensure uniqueness.
+* `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding
+  breaks for inclusion in generated labels. This value is increased automatically if necessary
+  so that rounded breaks are unique. Only used for floating point types and when `labels` is a
+  function, in which case it is passed to it as a keyword argument.
 * `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints
   other than the last one are equal, generating empty intervals;
   when `true`, duplicate breaks are allowed and the intervals they generate are kept as
   unused levels (but duplicate labels are not allowed).
 """
 function cut(x::AbstractArray, ngroups::Integer;
-             labels::Union{AbstractVector{<:SupportedTypes},Function}=quantile_formatter,
+             labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing,
+             sigdigits::Integer=3,
              allowempty::Bool=false)
     ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
     sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
@@ -286,5 +393,8 @@ function cut(x::AbstractArray, ngroups::Integer;
                             "Pass `allowempty=true` to allow empty quantiles or " *
                             "choose a lower value for `ngroups`."))
     end
-    cut(x, breaks; labels=labels, allowempty=allowempty)
+    if labels === nothing
+        labels = allowempty ? numbered_formatter : default_formatter
+    end
+    return cut(x, breaks; labels=labels, sigdigits=sigdigits, allowempty=allowempty)
 end
diff --git a/test/15_extras.jl b/test/15_extras.jl
index af4f79f5..5df7860b 100644
--- a/test/15_extras.jl
+++ b/test/15_extras.jl
@@ -93,10 +93,10 @@ const ≅ = isequal
     @test levels(x) == ["b", "a"]
 
     x = @inferred cut(Matrix{Union{Float64, T}}([-1.1 3.0; 1.456 10.394]), [-2.134, 3.0, 12.5])
-    @test x == ["[-2.134, 3.0)" "[3.0, 12.5]"; "[-2.134, 3.0)" "[3.0, 12.5]"]
+    @test x == ["[-2.13, 3)" "[3, 12.5]"; "[-2.13, 3)" "[3, 12.5]"]
     @test isa(x, CategoricalMatrix{Union{String, T}})
     @test isordered(x)
-    @test levels(x) == ["[-2.134, 3.0)", "[3.0, 12.5]"]
+    @test levels(x) == ["[-2.13, 3)", "[3, 12.5]"]
 
     labels = 0:2:8
     x = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels)
@@ -127,18 +127,18 @@ end
 
 @testset "cut([5, 4, 3, 2], 2)" begin
     x = @inferred cut([5, 4, 3, 2], 2)
-    @test x == ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", "Q1: [2, 4)"]
+    @test x == ["[4, 5]", "[4, 5]", "[2, 4)", "[2, 4)"]
     @test isa(x, CategoricalArray)
     @test isordered(x)
-    @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
+    @test levels(x) == ["[2, 4)", "[4, 5]"]
 end
 
 @testset "cut(x, n) with missing values" begin
     x = @inferred cut([5, 4, 3, missing, 2], 2)
-    @test x ≅ ["Q2: [4, 5]", "Q2: [4, 5]", "Q1: [2, 4)", missing, "Q1: [2, 4)"]
+    @test x ≅ ["[4, 5]", "[4, 5]", "[2, 4)", missing, "[2, 4)"]
     @test isa(x, CategoricalArray)
     @test isordered(x)
-    @test levels(x) == ["Q1: [2, 4)", "Q2: [4, 5]"]
+    @test levels(x) == ["[2, 4)", "[4, 5]"]
 end
 
 @testset "cut(x, n) with invalid n" begin
@@ -147,7 +147,7 @@ end
 end
 
 @testset "cut with formatter function" begin
-    my_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to"
+    my_formatter(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $from -- $to"
 
     x = 0.15:0.20:0.95
     p = [0, 0.4, 0.8, 1.0]
@@ -155,20 +155,24 @@ end
     a = @inferred cut(x, p, labels=my_formatter)
     @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"]
 
+    my_old_formatter(from, to, i; leftclosed, rightclosed) = "$i: $from -- $to"
+    a = @test_deprecated r"`labels`.*" cut(x, p, labels=my_old_formatter)
+    @test a == ["1: 0.0 -- 0.4", "1: 0.0 -- 0.4", "2: 0.4 -- 0.8", "2: 0.4 -- 0.8", "3: 0.8 -- 1.0"]
+
     # GH 274
-    my_formatter_2(from, to, i; leftclosed, rightclosed) = "$i: $(from+1) -- $(to+1)"
+    my_formatter_2(from, to, i; leftclosed, rightclosed, sigdigits) = "$i: $(from+1) -- $(to+1)"
     a = @inferred cut(x, p, labels=my_formatter_2)
     @test a == ["1: 1.0 -- 1.4", "1: 1.0 -- 1.4", "2: 1.4 -- 1.8", "2: 1.4 -- 1.8", "3: 1.8 -- 2.0"]
 
     for T in (Union{}, Missing)
-        labels = (from, to, i; leftclosed, rightclosed) -> (to+from)/2
+        labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> (to+from)/2
         a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels)
         @test a == [1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0]
         @test isa(a, CategoricalVector{Union{Float64, T}})
         @test isordered(a)
         @test levels(a) == [1.0, 3.0, 5.0, 7.0, 9.0]
 
-        labels = (from, to, i; leftclosed, rightclosed) -> "$((to+from)/2)"
+        labels = (from, to, i; leftclosed, rightclosed, sigdigits) -> "$((to+from)/2)"
         a = @inferred cut(Vector{Union{T, Int}}(1:8), 0:2:10, labels=labels)
         @test a == string.([1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0])
         @test isa(a, CategoricalVector{Union{String, T}})
@@ -188,8 +192,8 @@ end
     @test_throws ArgumentError cut(x, [0, 0.1, 0.1, 10])
     @test_throws ArgumentError cut(x, 10)
     y = cut(x, [0, 0.1, 10, 10])
-    @test y == [fill("[0.0, 0.1)", 10); fill("[0.1, 10.0)", 10)]
-    @test levels(y) == ["[0.0, 0.1)", "[0.1, 10.0)", "[10.0, 10.0]"]
+    @test y == [fill("[0, 0.1)", 10); fill("[0.1, 10)", 10)]
+    @test levels(y) == ["[0, 0.1)", "[0.1, 10)", "[10, 10]"]
 
     @test_throws ArgumentError cut(1:10, [1, 5, 5, 11])
     y = cut(1:10, [1, 5, 5, 11], allowempty=true)
@@ -251,55 +255,55 @@ end
     @test_throws ArgumentError cut(1:8, 0:2:10, labels=[0, 1, 1, 2, 3])
     @test_throws ArgumentError cut(1:8, [0, 2, 2, 6, 8, 10], labels=[0, 1, 1, 2, 3], allowempty=true)
 
-    fmt = (from, to, i; leftclosed, rightclosed) -> (i % 2 == 0 ? to : 0.0)
+    fmt = (from, to, i; leftclosed, rightclosed, sigdigits) -> (i % 2 == 0 ? to : 0.0)
     @test_throws ArgumentError cut(1:8, 0:2:10, labels=fmt)
 
     @test_throws ArgumentError cut([fill(1, 10); 4], 2)
     x = cut([fill(1, 10); 4], 2, allowempty=true)
-    @test unique(x) == ["Q2: [1, 4]"]
-    @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4]"]
+    @test unique(x) == ["2: [1, 4]"]
+    @test levels(x) == ["1: (1, 1)", "2: [1, 4]"]
     @test_throws ArgumentError cut([fill(1, 10); 4], 3)
     x = cut([fill(1, 10); 4], 3, allowempty=true)
-    @test unique(x) == ["Q3: [1, 4]"]
-    @test levels(x) == ["Q1: (1, 1)", "Q2: (1, 1)", "Q3: [1, 4]"]
+    @test unique(x) == ["3: [1, 4]"]
+    @test levels(x) == ["1: (1, 1)", "2: (1, 1)", "3: [1, 4]"]
 
     x = cut([fill(4, 10); 1], 2)
-    @test x == [fill("Q2: [4, 4]", 10); "Q1: [1, 4)"]
-    @test levels(x) == ["Q1: [1, 4)"; "Q2: [4, 4]"]
+    @test x == [fill("[4, 4]", 10); "[1, 4)"]
+    @test levels(x) == ["[1, 4)"; "[4, 4]"]
     @test_throws ArgumentError cut([fill(4, 10); 1], 3)
     x = cut([fill(4, 10); 1], 3, allowempty=true)
-    @test x == [fill("Q3: [4, 4]", 10); "Q1: [1, 4)"]
-    @test levels(x) == ["Q1: [1, 4)", "Q2: (4, 4)", "Q3: [4, 4]"]
+    @test x == [fill("3: [4, 4]", 10); "1: [1, 4)"]
+    @test levels(x) == ["1: [1, 4)", "2: (4, 4)", "3: [4, 4]"]
 
     x = cut([fill(1, 5); fill(4, 5)], 2)
-    @test x == [fill("Q1: [1, 4)", 5); fill("Q2: [4, 4]", 5)]
-    @test levels(x) == ["Q1: [1, 4)", "Q2: [4, 4]"]
+    @test x == [fill("[1, 4)", 5); fill("[4, 4]", 5)]
+    @test levels(x) == ["[1, 4)", "[4, 4]"]
     @test_throws ArgumentError  cut([fill(1, 5); fill(4, 5)], 3)
     x = cut([fill(1, 5); fill(4, 5)], 3, allowempty=true)
-    @test x == [fill("Q2: [1, 4)", 5); fill("Q3: [4, 4]", 5)]
-    @test levels(x) == ["Q1: (1, 1)", "Q2: [1, 4)", "Q3: [4, 4]"]
+    @test x == [fill("2: [1, 4)", 5); fill("3: [4, 4]", 5)]
+    @test levels(x) == ["1: (1, 1)", "2: [1, 4)", "3: [4, 4]"]
 end
 
 @testset "cut with -0.0" begin
     x = cut([-0.0, 0.0, 0.0, -0.0], 2)
-    @test x == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]", "Q2: [0.0, 0.0]", "Q1: [-0.0, 0.0)"]
-    @test levels(x) == ["Q1: [-0.0, 0.0)", "Q2: [0.0, 0.0]"]
+    @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"]
+    @test levels(x) == ["[-0, 0)", "[0, 0]"]
 
     x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0, 0.0])
-    @test x == ["[-0.0, 0.0)", "[0.0, 0.0]", "[0.0, 0.0]", "[-0.0, 0.0)"]
-    @test levels(x) == ["[-0.0, 0.0)", "[0.0, 0.0]"]
+    @test x == ["[-0, 0)", "[0, 0]", "[0, 0]", "[-0, 0)"]
+    @test levels(x) == ["[-0, 0)", "[0, 0]"]
 
     x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0, 0.0])
-    @test x == fill("[-0.0, 0.0]", 4)
-    @test levels(x) == ["[-0.0, 0.0]"]
+    @test x == fill("[-0, 0]", 4)
+    @test levels(x) == ["[-0, 0]"]
 
     x = cut([-0.0, 0.0, 0.0, -0.0], [0.0], extend=true)
-    @test x == fill("[-0.0, 0.0]", 4)
-    @test levels(x) == ["[-0.0, 0.0]"]
+    @test x == fill("[-0, 0]", 4)
+    @test levels(x) == ["[-0, 0]"]
 
     x = cut([-0.0, 0.0, 0.0, -0.0], [-0.0], extend=true)
-    @test x == fill("[-0.0, 0.0]", 4)
-    @test levels(x) == ["[-0.0, 0.0]"]
+    @test x == fill("[-0, 0]", 4)
+    @test levels(x) == ["[-0, 0]"]
 
     x = cut([-0.0, 0.0, 0.0, -0.0], 2, labels=[-0.0, 0.0])
     @test x == [-0.0, 0.0, 0.0, -0.0]
@@ -336,7 +340,7 @@ end
     @test levels(x) == [-0.0, 0.0]
 
     x = @inferred cut(-1:0.5:1, [0, 1], extend=true)
-    @test x == ["[-1.0, 0.0)", "[-1.0, 0.0)", "[0.0, 1.0]", "[0.0, 1.0]", "[0.0, 1.0]"]
+    @test x == ["[-1, 0)", "[-1, 0)", "[0, 1]", "[0, 1]", "[0, 1]"]
 end
 
 @testset "cut with NaN and Inf" begin
@@ -346,37 +350,77 @@ end
     @test_throws ArgumentError("NaN values are not allowed in breaks") cut([1, 2], [1, NaN])
 
     x = cut([1, Inf], [1], extend=true)
-    @test x ≅ ["[1.0, Inf]", "[1.0, Inf]"]
-    @test levels(x) == ["[1.0, Inf]"]
+    @test x ≅ ["[1, Inf]", "[1, Inf]"]
+    @test levels(x) == ["[1, Inf]"]
 
     x = cut([1, -Inf], [1], extend=true)
-    @test x ≅ ["[-Inf, 1.0]", "[-Inf, 1.0]"]
-    @test levels(x) == ["[-Inf, 1.0]"]
+    @test x ≅ ["[-Inf, 1]", "[-Inf, 1]"]
+    @test levels(x) == ["[-Inf, 1]"]
 
     x = cut([1:5; Inf], [1, 2, Inf])
-    @test x ≅ ["[1.0, 2.0)"; fill("[2.0, Inf]", 5)]
-    @test levels(x) == ["[1.0, 2.0)", "[2.0, Inf]"]
+    @test x ≅ ["[1, 2)"; fill("[2, Inf]", 5)]
+    @test levels(x) == ["[1, 2)", "[2, Inf]"]
 
     x = cut([1:5; -Inf], [-Inf, 2, 5])
-    @test x ≅ ["[-Inf, 2.0)"; fill("[2.0, 5.0]", 4); "[-Inf, 2.0)"]
-    @test levels(x) == ["[-Inf, 2.0)", "[2.0, 5.0]"]
+    @test x ≅ ["[-Inf, 2)"; fill("[2, 5]", 4); "[-Inf, 2)"]
+    @test levels(x) == ["[-Inf, 2)", "[2, 5]"]
 
     x = cut([1:5; Inf], 2)
-    @test x ≅ [fill("Q1: [1.0, 4.0)", 3); fill("Q2: [4.0, Inf]", 3)]
-    @test levels(x) == ["Q1: [1.0, 4.0)", "Q2: [4.0, Inf]"]
+    @test x ≅ [fill("[1, 4)", 3); fill("[4, Inf]", 3)]
+    @test levels(x) == ["[1, 4)", "[4, Inf]"]
 
     x = cut([1:5; -Inf], 2)
-    @test x ≅ [fill("Q1: [-Inf, 3.0)", 2); fill("Q2: [3.0, 5.0]", 3); "Q1: [-Inf, 3.0)"]
-    @test levels(x) == ["Q1: [-Inf, 3.0)", "Q2: [3.0, 5.0]"]
+    @test x ≅ [fill("[-Inf, 3)", 2); fill("[3, 5]", 3); "[-Inf, 3)"]
+    @test levels(x) == ["[-Inf, 3)", "[3, 5]"]
 end
 
 @testset "cut when quantile falls exactly on a data value" begin
     x = cut([11, 14, 43, 54, 54, 56, 73, 79, 84, 84], 3)
     @test x ==
-        ["Q1: [11, 54)", "Q1: [11, 54)", "Q1: [11, 54)",
-        "Q2: [54, 73)", "Q2: [54, 73)", "Q2: [54, 73)",
-        "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]", "Q3: [73, 84]"]
-    @test levels(x) == ["Q1: [11, 54)", "Q2: [54, 73)", "Q3: [73, 84]"]
+        ["[11, 54)", "[11, 54)", "[11, 54)",
+        "[54, 73)", "[54, 73)", "[54, 73)",
+        "[73, 84]", "[73, 84]", "[73, 84]", "[73, 84]"]
+    @test levels(x) == ["[11, 54)", "[54, 73)", "[73, 84]"]
+end
+
+@testset "cut computation of sigdigits" begin
+    x = cut([1.2, 1.3, 2], 2)
+    @test levels(x) == ["[1.2, 1.3)", "[1.3, 2]"]
+
+    x = cut([1.0, 2.0, 3.0], 2)
+    @test levels(x) == ["[1, 2)", "[2, 3]"]
+
+    x = cut([1.00002, 1.00003, 2], 2)
+    @test levels(x) == ["[1.00002, 1.00003)", "[1.00003, 2]"]
+
+    x = cut([1.00002, 1.00003, 1.00005, 2], 2)
+    @test levels(x) == ["[1, 1.0001)", "[1.0001, 2]"]
+
+    x = cut([1.00001, 1.00002, 1.00002, 2], 2)
+    @test levels(x) == ["[1.00001, 1.00002)", "[1.00002, 2]"]
+
+    x = cut([1.00001, 1.00003, 1.1, 2], 2)
+    @test levels(x) == ["[1, 1.1)", "[1.1, 2]"]
+
+    # @sprintf with %g uses scientific notation even in some cases
+    # where classic notation would be shorter
+    x = cut([1.0, 10.0, 100.0, 1000.0], [1.0, 10.0, 100.0, 1000.0])
+    @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1e+03]"]
+    # But integers are rendered using plain `string`
+    x = cut([1, 10, 100], [1, 10, 100, 1000])
+    @test levels(x) == ["[1, 10)", "[10, 100)", "[100, 1000]"]
+
+    # Extreme case
+    x = cut([8.85718832925723e-7, 8.572446994052413e-7, 1.40217695121027e-7, 8.966449714804087e-7,
+             3.070384341319470e-7, 3.070384341319471e-7, 1.8520709563325888e-7, 5.630461710066611e-7,
+             6.781422109070843e-7, 4.776113711396994e-7, 0.2538909094146984, 0.5249665525921473,
+             0.8321957380046366, 0.9648282851978118, 0.36084175275805797, 0.7851054639425253,
+             0.6875195857202754, 0.614940093507575, 0.6224944997292978, 0.6055683461790675,
+             5.349085340927365e11, 1.3471583229449602e11, 6.538893396835975e11, 4.826316844547661e11,
+             8.803607035550856e11, 1.8174694671397316e10, 1.6709745443719125e11, 3.2050577954311835e11,
+             1.6134999167460663e11, 7.396308745225059e11], 3)
+    @test levels(x) == ["[1.4e-07, 0.254)", "[0.254, 1.82e+10)", "[1.82e+10, 8.8e+11]"]
+
 end
 
 end
\ No newline at end of file

From 11d43c1bc1c2b67046867fd9bca318443fa46a25 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Wed, 21 May 2025 19:35:31 +0200
Subject: [PATCH 22/25] Support weighted quantiles in `cut` (#423)

This requires adding an extension point for StatsBase.
Unfortunately, more copies of the data and weights are made than necessary,
as StatsBase supports neither an in-place weighted `quantile!` on pre-sorted
data nor taking a view of a weights vector (JuliaStats/StatsBase.jl#723).
---
 Project.toml                         |  6 +++-
 ext/CategoricalArraysStatsBaseExt.jl | 13 ++++++++
 src/CategoricalArrays.jl             |  1 +
 src/extras.jl                        | 44 +++++++++++++++++++++++-----
 test/15_extras.jl                    | 24 +++++++++++++++
 5 files changed, 79 insertions(+), 9 deletions(-)
 create mode 100644 ext/CategoricalArraysStatsBaseExt.jl

diff --git a/Project.toml b/Project.toml
index 2c345ff7..a9262e93 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
 StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 
@@ -23,6 +24,7 @@ StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 CategoricalArraysArrowExt = "Arrow"
 CategoricalArraysJSONExt = "JSON"
 CategoricalArraysRecipesBaseExt = "RecipesBase"
+CategoricalArraysStatsBaseExt = "StatsBase"
 CategoricalArraysSentinelArraysExt = "SentinelArrays"
 CategoricalArraysStructTypesExt = "StructTypes"
 
@@ -37,6 +39,7 @@ RecipesBase = "1.1"
 Requires = "1"
 SentinelArrays = "1"
 Statistics = "1"
+StatsBase = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34"
 StructTypes = "1"
 julia = "1.6"
 
@@ -49,8 +52,9 @@ PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 RecipesPipeline = "01d81517-befc-4cb6-b9ec-a95719d0359c"
 SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StructTypes", "Test"]
+test = ["Arrow", "Dates", "JSON", "JSON3", "PooledArrays", "RecipesBase", "RecipesPipeline", "SentinelArrays", "StatsBase", "StructTypes", "Test"]
diff --git a/ext/CategoricalArraysStatsBaseExt.jl b/ext/CategoricalArraysStatsBaseExt.jl
new file mode 100644
index 00000000..8cbd5c61
--- /dev/null
+++ b/ext/CategoricalArraysStatsBaseExt.jl
@@ -0,0 +1,13 @@
+module CategoricalArraysStatsBaseExt
+
+if isdefined(Base, :get_extension)
+    import CategoricalArrays: _wquantile
+    using StatsBase
+else
+    import ..CategoricalArrays: _wquantile
+    using ..StatsBase
+end
+
+_wquantile(x::AbstractArray, w::AbstractWeights, p::AbstractVector) = quantile(x, w, p)
+
+end
diff --git a/src/CategoricalArrays.jl b/src/CategoricalArrays.jl
index f3383645..f44b3c2f 100644
--- a/src/CategoricalArrays.jl
+++ b/src/CategoricalArrays.jl
@@ -45,6 +45,7 @@ module CategoricalArrays
             @require JSON="682c06a0-de6a-54ab-a142-c8b1cf79cde6" include("../ext/CategoricalArraysJSONExt.jl")
             @require RecipesBase="3cdcf5f2-1ef4-517c-9805-6587b60abb01" include("../ext/CategoricalArraysRecipesBaseExt.jl")
             @require SentinelArrays="91c51154-3ec4-41a3-a24f-3f23e20d615c" include("../ext/CategoricalArraysSentinelArraysExt.jl")
+            @require StatsBase="2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" include("../ext/CategoricalArraysStatsBaseExt.jl")
             @require StructTypes="856f2bd8-1eba-4b0a-8007-ebc267875bd4" include("../ext/CategoricalArraysStructTypesExt.jl")
         end
     end
diff --git a/src/extras.jl b/src/extras.jl
index 60f32a64..910c6e46 100644
--- a/src/extras.jl
+++ b/src/extras.jl
@@ -337,11 +337,17 @@ function find_breaks(v::AbstractVector, qs::AbstractVector)
     return breaks
 end
 
+# AbstractWeights method is defined in StatsBase extension
+# There is no in-place weighted quantile method in StatsBase
+_wquantile(x::AbstractArray, w::AbstractVector, p::AbstractVector) =
+    throw(ArgumentError("`weights` must be an `AbstractWeights` vector from StatsBase.jl"))
+
 """
     cut(x::AbstractArray, ngroups::Integer;
         labels::Union{AbstractVector{<:AbstractString},Function},
         sigdigits::Integer=3,
-        allowempty::Bool=false)
+        allowempty::Bool=false,
+        weights::Union{AbstractWeights, Nothing}=nothing)
 
 Cut a numeric array into `ngroups` quantiles.
 
@@ -373,19 +379,41 @@ quantiles.
   other than the last one are equal, generating empty intervals;
   when `true`, duplicate breaks are allowed and the intervals they generate are kept as
   unused levels (but duplicate labels are not allowed).
+* `weights::Union{AbstractWeights, Nothing}=nothing`: observation weights to use when
+  computing quantiles (see `quantile` documentation in StatsBase).
 """
 function cut(x::AbstractArray, ngroups::Integer;
              labels::Union{AbstractVector{<:SupportedTypes},Function,Nothing}=nothing,
              sigdigits::Integer=3,
-             allowempty::Bool=false)
+             allowempty::Bool=false,
+             weights::Union{AbstractVector, Nothing}=nothing)
     ngroups >= 1 || throw(ArgumentError("ngroups must be strictly positive (got $ngroups)"))
-    sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
-    min_x, max_x = first(sorted_x), last(sorted_x)
-    if (min_x isa Number && isnan(min_x)) ||
-        (max_x isa Number && isnan(max_x))
-        throw(ArgumentError("NaN values are not allowed in input vector"))
+    if weights === nothing
+        sorted_x = eltype(x) >: Missing ? sort!(collect(skipmissing(x))) : sort(x)
+        min_x, max_x = first(sorted_x), last(sorted_x)
+        if (min_x isa Number && isnan(min_x)) ||
+            (max_x isa Number && isnan(max_x))
+            throw(ArgumentError("NaN values are not allowed in input vector"))
+        end
+        qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true)
+    else
+        if eltype(x) >: Missing
+            nm_inds = findall(!ismissing, x)
+            nm_x = view(x, nm_inds)
+            # TODO: use a view once this is supported (JuliaStats/StatsBase.jl#723)
+            nm_weights = weights[nm_inds]
+        else
+            nm_x = x
+            nm_weights = weights
+        end
+        sorted_x = sort(nm_x)
+        min_x, max_x = first(sorted_x), last(sorted_x)
+        if (min_x isa Number && isnan(min_x)) ||
+            (max_x isa Number && isnan(max_x))
+            throw(ArgumentError("NaN values are not allowed in input vector"))
+        end
+        qs = _wquantile(nm_x, nm_weights, (1:(ngroups-1))/ngroups)
     end
-    qs = quantile!(sorted_x, (1:(ngroups-1))/ngroups, sorted=true)
     breaks = [min_x; find_breaks(sorted_x, qs); max_x]
     if !allowempty && !allunique(@view breaks[1:end-1])
         throw(ArgumentError("cannot compute $ngroups quantiles due to " *
diff --git a/test/15_extras.jl b/test/15_extras.jl
index 5df7860b..80dc14b7 100644
--- a/test/15_extras.jl
+++ b/test/15_extras.jl
@@ -1,6 +1,8 @@
 module TestExtras
 using Test
 using CategoricalArrays
+using StatsBase
+using Missings
 
 const ≅ = isequal
 
@@ -423,4 +425,26 @@ end
 
 end
 
+@testset "cut with weighted quantiles" begin
+    @test_throws ArgumentError cut(1:3, 3, weights=1:3)
+
+    x = collect(Float64, 1:100)
+    w = fweights(repeat(1:10, inner=10))
+    y = cut(x, 10, weights=w)
+    @test levelcode.(y) == levelcode.(cut(x, quantile(x, w, (0:10)./10)))
+    @test levels(y) == ["[1, 29)", "[29, 43)", "[43, 53)", "[53, 62)", "[62, 70)",
+                        "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"]
+
+    mx = allowmissing(x)
+    mx[2] = mx[10] = missing
+    nm_inds = .!ismissing.(mx)
+    y = cut(mx, 10, weights=w)
+    @test levelcode.(y) ≅ levelcode.(cut(mx, quantile(x[nm_inds], w[nm_inds], (0:10)./10)))
+    @test levels(y) == ["[1, 30)", "[30, 43)", "[43, 53)", "[53, 62)", "[62, 70)",
+                        "[70, 77)", "[77, 83)", "[83, 89)", "[89, 95)", "[95, 100]"]
+
+    x[5] = NaN
+    @test_throws ArgumentError cut(x, 3, weights=w)
+end
+
 end
\ No newline at end of file

From 13a9bad33f1500b950f355e837ba546d428c0e38 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Fri, 23 May 2025 23:32:51 +0200
Subject: [PATCH 23/25] Add `Array` constructors and `convert` methods (#420)

Consistent with existing `similar` methods and the `Array` constructor, ensure
`T(::CategoricalArray{U})` and `convert(T, ::CategoricalArray{U})` return an
`Array{U}` for `T` in `Array`, `Vector`, `Matrix`. Same for `SubArray`s
of `CategoricalArray`s. This avoids creating `Array{<:CategoricalValue}`
objects which are inefficient and unlikely to be what users want.
---
 src/array.jl           |  17 +++++--
 test/13_arraycommon.jl | 100 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index c462e7d4..8101fb56 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -1,6 +1,7 @@
 ## Code for CategoricalArray
 
-import Base: Array, convert, collect, copy, getindex, setindex!, similar, size,
+import Base: Array, Vector, Matrix, convert, collect, copy, getindex,
+             setindex!, similar, size,
              unique, unique!, vcat, in, summary, float, complex, copyto!
 
 # Used for keyword argument default value
@@ -410,6 +411,12 @@ convert(::Type{CategoricalArray{T, N}}, A::CategoricalArray{T, N}) where {T, N}
 convert(::Type{CategoricalArray{T}}, A::CategoricalArray{T}) where {T} = A
 convert(::Type{CategoricalArray}, A::CategoricalArray) = A
 
+convert(::Type{Array{S, N}}, A::CatArrOrSub{T, N}) where {S, T, N} =
+    collect(S, A)
+convert(::Type{Array}, A::CatArrOrSub) = Array(A)   # same result as the constructor
+convert(::Type{Vector}, A::CatArrOrSub) = Vector(A) # (unwrap.(A) could narrow eltype
+convert(::Type{Matrix}, A::CatArrOrSub) = Matrix(A) # and ignored the requested ndims)
+
 function Base.:(==)(A::CategoricalArray{S}, B::CategoricalArray{T}) where {S, T}
     if size(A) != size(B)
         return false
@@ -1048,8 +1055,10 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R}
     end
 end
 
-Array(A::CategoricalArray{T}) where {T} = Array{T}(A)
-collect(A::CategoricalArray) = copy(A)
+Array(A::CatArrOrSub{T}) where {T} = Array{T}(A)
+Vector(A::CatArrOrSub{T}) where {T} = Vector{T}(A)
+Matrix(A::CatArrOrSub{T}) where {T} = Matrix{T}(A)
+collect(A::CatArrOrSub) = copy(A)
 
 # Defined for performance
 collect(x::Base.SkipMissing{<: CatArrOrSub{T}}) where {T} =
@@ -1119,7 +1128,7 @@ function Base.sort!(v::CategoricalVector;
     levs = eltype(v) >: Missing ?
         eltype(v)[i == 0 ? missing : CategoricalValue(v.pool, i) for i in 0:length(v.pool)] :
         eltype(v)[CategoricalValue(v.pool, i) for i in 1:length(v.pool)]
-    sortedlevs = sort!(Vector(view(levs, seen)), order=ord)
+    sortedlevs = sort!(Vector{eltype(levs)}(view(levs, seen)), order=ord)
     levelsmap = something.(indexin(sortedlevs, levs))
     j = 0
     refs = v.refs
diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl
index 02b51bd7..2cf3ff6b 100644
--- a/test/13_arraycommon.jl
+++ b/test/13_arraycommon.jl
@@ -1330,18 +1330,116 @@ end
     @test levels(x) == [2, 1, 3, 4]
 end
 
-@testset "Array(::CategoricalArray{T}) produces Array{T}" begin
+@testset "Array(::CatArrOrSub{T}) produces Array{T}" begin
     x = [1,1,2,2]
     y = categorical(x)
     z = Array(y)
     @test typeof(x) == typeof(z)
     @test z == x
+    z = Array(view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
 
     x = [1,1,2,missing]
     y = categorical(x)
     z = Array(y)
     @test typeof(x) == typeof(z)
     @test z ≅ x
+    z = Array(view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+
+    x = [1,1,2,2]
+    y = categorical(x)
+    z = Vector(y)
+    @test typeof(x) == typeof(z)
+    @test z == x
+    z = Vector(view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
+
+    x = [1,1,2,missing]
+    y = categorical(x)
+    z = Vector(y)
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+    z = Vector(view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+
+    x = [1 1 2 2]
+    y = categorical(x)
+    z = Matrix(y)
+    @test typeof(x) == typeof(z)
+    @test z == x
+    z = Matrix(view(y, :, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
+
+    x = [1 1 2 missing]
+    y = categorical(x)
+    z = Matrix(y)
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+    z = Matrix(view(y, :, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+end
+
+@testset "convert(Array, ::CatArrOrSub{T}) produces Array{T}" begin
+    x = [1,1,2,2]
+    y = categorical(x)
+    z = convert(Array, y)
+    @test typeof(x) == typeof(z)
+    @test z == x
+    z = convert(Array, view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
+
+    x = [1,1,2,missing]
+    y = categorical(x)
+    z = convert(Array, y)
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+    z = convert(Array, view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+
+    x = [1,1,2,2]
+    y = categorical(x)
+    z = convert(Vector, y)
+    @test typeof(x) == typeof(z)
+    @test z == x
+    z = convert(Vector, view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
+
+    x = [1,1,2,missing]
+    y = categorical(x)
+    z = convert(Vector, y)
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+    z = convert(Vector, view(y, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+
+    x = [1 1 2 2]
+    y = categorical(x)
+    z = convert(Matrix, y)
+    @test typeof(x) == typeof(z)
+    @test z == x
+    z = convert(Matrix, view(y, :, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z == x
+
+    x = [1 1 2 missing]
+    y = categorical(x)
+    z = convert(Matrix, y)
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+    z = convert(Matrix, view(y, :, 1:4))
+    @test typeof(x) == typeof(z)
+    @test z ≅ x
+end
 
 @testset "Array{T} constructors and convert" begin

From 7badc9ec6c2f3fe29fb2abd9a02996f2b240c566 Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Thu, 31 Jul 2025 11:37:03 +0200
Subject: [PATCH 24/25] Make `levels` return a `CategoricalArray` (#425)

Having `levels` preserve the eltype of the input is sometimes useful when
writing generic code. This is only slightly breaking, as the result still
compares equal to the vector of unwrapped values that was returned previously.
---
 benchmark/benchmarks.jl                |  4 +-
 docs/src/using.md                      | 18 +++----
 ext/CategoricalArraysArrowExt.jl       |  2 +
 ext/CategoricalArraysRecipesBaseExt.jl |  2 +-
 src/array.jl                           | 56 +++++++++++----------
 src/pool.jl                            | 29 ++++++-----
 src/recode.jl                          | 10 ++--
 src/typedefs.jl                        |  5 +-
 src/value.jl                           |  8 +--
 test/01_value.jl                       |  7 ++-
 test/07_levels.jl                      | 68 ++++++++++++++++++--------
 test/11_array.jl                       |  2 +-
 test/12_missingarray.jl                |  2 +-
 test/13_arraycommon.jl                 |  8 +--
 test/14_view.jl                        |  3 +-
 15 files changed, 135 insertions(+), 89 deletions(-)

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 5c2ae42b..bf12f7c9 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -55,11 +55,11 @@ SUITE["many levels"]["CategoricalArray(::Vector{String})"] =
 a = rand([@sprintf("id%010d", k) for k in 1:1000], 10000)
 ca = CategoricalArray(a)
 
-levs = levels(ca)
+levs = unwrap.(levels(ca))
 SUITE["many levels"]["levels! with original levels"] =
     @benchmarkable levels!(ca, levs)
 
-levs = reverse(levels(ca))
+levs = reverse(unwrap.(levels(ca)))
 SUITE["many levels"]["levels! with resorted levels"] =
     @benchmarkable levels!(ca, levs)
 
diff --git a/docs/src/using.md b/docs/src/using.md
index 9790e8cf..24c452b0 100644
--- a/docs/src/using.md
+++ b/docs/src/using.md
@@ -20,7 +20,7 @@ By default, the levels are lexically sorted, which is clearly not correct in our
 
 ```jldoctest using
 julia> levels(x)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "Middle"
  "Old"
  "Young"
@@ -68,7 +68,7 @@ To get rid of the `"Old"` group, just call the [`droplevels!`](@ref) function:
 
 ```jldoctest using
 julia> levels(x)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "Young"
  "Middle"
  "Old"
@@ -81,7 +81,7 @@ julia> droplevels!(x)
  "Young"
 
 julia> levels(x)
-2-element Vector{String}:
+2-element CategoricalArray{String,1,UInt32}:
  "Young"
  "Middle"
 
@@ -139,7 +139,7 @@ Levels still need to be reordered manually:
 
 ```jldoctest using
 julia> levels(y)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "Middle"
  "Old"
  "Young"
@@ -251,7 +251,7 @@ julia> xy = vcat(x, y)
  "Middle"
 
 julia> levels(xy)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "Young"
  "Middle"
  "Old"
@@ -263,7 +263,7 @@ true
 Likewise, assigning a `CategoricalValue` from `y` to an entry in `x` expands the levels of `x` with all levels from `y`, *respecting the ordering of levels of both vectors if possible*:
 ```jldoctest using
 julia> levels(x)
-2-element Vector{String}:
+2-element CategoricalArray{String,1,UInt32}:
  "Middle"
  "Old"
 
@@ -271,7 +271,7 @@ julia> x[1] = y[1]
 CategoricalValue{String, UInt32} "Young" (1/2)
 
 julia> levels(x)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "Young"
  "Middle"
  "Old"
@@ -296,7 +296,7 @@ julia> ab = vcat(a, b)
  "c"
 
 julia> levels(ab)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "a"
  "b"
  "c"
@@ -320,7 +320,7 @@ julia> ab2 = vcat(a, b)
  "c"
 
 julia> levels(ab2)
-3-element Vector{String}:
+3-element CategoricalArray{String,1,UInt32}:
  "a"
  "b"
  "c"
diff --git a/ext/CategoricalArraysArrowExt.jl b/ext/CategoricalArraysArrowExt.jl
index 3e764122..811870d2 100644
--- a/ext/CategoricalArraysArrowExt.jl
+++ b/ext/CategoricalArraysArrowExt.jl
@@ -7,6 +7,8 @@ import Arrow: ArrowTypes
 const CATARRAY_ARROWNAME = Symbol("JuliaLang.CategoricalArrays.CategoricalArray")
 ArrowTypes.arrowname(::Type{<:CategoricalValue}) = CATARRAY_ARROWNAME
 ArrowTypes.arrowmetadata(::Type{CategoricalValue{T, R}}) where {T, R} = string(R)
+ArrowTypes.ArrowType(::Type{<:CategoricalValue{T}}) where {T} = T
+ArrowTypes.toarrow(x::CategoricalValue) = unwrap(x)
 
 ArrowTypes.arrowname(::Type{Union{<:CategoricalValue, Missing}}) = CATARRAY_ARROWNAME
 ArrowTypes.arrowmetadata(::Type{Union{CategoricalValue{T, R}, Missing}}) where {T, R} =
diff --git a/ext/CategoricalArraysRecipesBaseExt.jl b/ext/CategoricalArraysRecipesBaseExt.jl
index 2642f838..656f3e3d 100644
--- a/ext/CategoricalArraysRecipesBaseExt.jl
+++ b/ext/CategoricalArraysRecipesBaseExt.jl
@@ -9,7 +9,7 @@ else
 end
 
 RecipesBase.@recipe function f(::Type{T}, v::T) where T <: CategoricalValue
-    level_strings = [map(string, levels(v)); missing]
+    level_strings = [map(string, CategoricalArrays._levels(v)); missing]
     ticks --> eachindex(level_strings)
     v -> ismissing(v) ? length(level_strings) : Int(CategoricalArrays.refcode(v)),
     i -> level_strings[Int(i)]
diff --git a/src/array.jl b/src/array.jl
index 8101fb56..4d47e82c 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -240,7 +240,7 @@ function CategoricalArray{T, N, R}(A::CategoricalArray{S, N, Q};
         catch err
             err isa LevelsException || rethrow(err)
             throw(ArgumentError("encountered value(s) not in specified `levels`: " *
-                                "$(setdiff(CategoricalArrays.levels(res), levels))"))
+                                "$(setdiff(_levels(res), levels))"))
         end
     end
     return res
@@ -359,18 +359,18 @@ function _convert(::Type{CategoricalArray{T, N, R}}, A::AbstractArray{S, N};
     copyto!(res, A)
 
     if levels !== nothing
-        CategoricalArrays.levels(res) == levels ||
+        _levels(res) == levels ||
             throw(ArgumentError("encountered value(s) not in specified `levels`: " *
-                                "$(setdiff(CategoricalArrays.levels(res), levels))"))
+                                "$(setdiff(_levels(res), levels))"))
     else
         # if order is defined for level type, automatically apply it
         L = leveltype(res)
         if Base.OrderStyle(L) isa Base.Ordered
-            levels!(res, sort(CategoricalArrays.levels(res)))
+            levels!(res, sort(_levels(res)))
         elseif hasmethod(isless, (L, L))
             # isless may throw an error, e.g. for AbstractArray{T} of unordered T
             try
-                levels!(res, sort(CategoricalArrays.levels(res)))
+                levels!(res, sort(_levels(res)))
             catch e
                  e isa MethodError || rethrow(e)
             end
@@ -383,7 +383,7 @@ end
 # From CategoricalArray (preserve levels, ordering and R)
 function convert(::Type{CategoricalArray{T, N, R}}, A::CategoricalArray{S, N}) where {S, T, N, R}
     if length(A.pool) > typemax(R)
-        throw(LevelsException{T, R}(levels(A)[typemax(R)+1:end]))
+        throw(LevelsException{T, R}(_levels(A)[typemax(R)+1:end]))
     end
 
     if !(T >: Missing) && S >: Missing && any(iszero, A.refs)
@@ -467,7 +467,7 @@ size(A::CategoricalArray) = size(A.refs)
 Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear()
 
 function update_refs!(A::CategoricalArray, newlevels::AbstractVector)
-    oldlevels = levels(A)
+    oldlevels = _levels(A)
     levelsmap = similar(A.refs, length(oldlevels)+1)
     # 0 maps to a missing value
     levelsmap[1] = 0
@@ -485,7 +485,7 @@ function merge_pools!(A::CatArrOrSub,
                       updaterefs::Bool=true,
                       updatepool::Bool=true)
     newlevels, ordered = merge_pools(pool(A), pool(B))
-    oldlevels = levels(A)
+    oldlevels = _levels(A)
     pA = A isa SubArray ? parent(A) : A
     ordered!(pA, ordered)
     # If A's levels are an ordered superset of new (merged) pool, no need to recompute refs
@@ -544,8 +544,8 @@ function copyto!(dest::CatArrOrSub{T, N, R}, dstart::Integer,
 
     # try converting src to dest type to avoid partial copy corruption of dest
     # in the event that the src cannot be copied into dest
-    slevs = convert(Vector{T}, levels(src))
-    dlevs = levels(dest)
+    slevs = convert(Vector{T}, _levels(src))
+    dlevs = _levels(dest)
     if eltype(src) >: Missing && !(eltype(dest) >: Missing) && !all(x -> x > 0, srefs)
         throw(MissingException("cannot copy array with missing values to an array with element type $T"))
     end
@@ -598,7 +598,7 @@ function copyto!(dest::CatArrOrSub{T1, N, R}, dstart::Integer,
         return invoke(copyto!, Tuple{AbstractArray, Integer, AbstractArray, Integer, Integer},
                       dest, dstart, src, sstart, n)
     end
-    newdestlevs = destlevs = copy(levels(dest)) # copy since we need original levels below
+    newdestlevs = destlevs = copy(_levels(dest)) # copy since we need original levels below
     srclevsnm = T2 >: Missing ? setdiff(srclevs, [missing]) : srclevs
     if !(srclevsnm ⊆ destlevs)
         # if order is defined for level type, automatically apply it
@@ -708,7 +708,7 @@ While this will reduce memory use, this function is type-unstable, which can aff
 performance inside the function where the call is made. Therefore, use it with caution.
 """
 function compress(A::CategoricalArray{T, N}) where {T, N}
-    R = reftype(length(levels(A.pool)))
+    R = reftype(length(_levels(A.pool)))
     convert(CategoricalArray{T, N, R}, A)
 end
 
@@ -726,11 +726,11 @@ decompress(A::CategoricalArray{T, N}) where {T, N} =
     convert(CategoricalArray{T, N, DefaultRefType}, A)
 
 function vcat(A::CategoricalArray...)
-    ordered = any(isordered, A) && all(a->isordered(a) || isempty(levels(a)), A)
-    newlevels, ordered = mergelevels(ordered, map(levels, A)...)
+    ordered = any(isordered, A) && all(a->isordered(a) || isempty(_levels(a)), A)
+    newlevels, ordered = mergelevels(ordered, map(_levels, A)...)
 
     refsvec = map(A) do a
-        ii = convert(Vector{Int}, indexin(levels(a.pool), newlevels))
+        ii = convert(Vector{Int}, indexin(_levels(a.pool), newlevels))
         [x==0 ? 0 : ii[x] for x in a.refs]::Array{Int,ndims(a)}
     end
 
@@ -768,23 +768,25 @@ This may include levels which do not actually appear in the data
 `missing` will be included only if it appears in the data and
 `skipmissing=false` is passed.
 
-The returned vector is an internal field of `x` which must not be mutated
+The returned vector is owned by `x` and must not be mutated
 as doing so would corrupt it.
 """
-@inline function DataAPI.levels(A::CatArrOrSub{T}; skipmissing::Bool=true) where T
+@inline function DataAPI.levels(A::CatArrOrSub; skipmissing::Bool=true)
     if eltype(A) >: Missing && !skipmissing
         if any(==(0), refs(A))
-            T[levels(pool(A)); missing]
+            eltype(A)[levels(pool(A)); missing]
         else
-            convert(Vector{T}, levels(pool(A)))
+            levels_missing(pool(A))
         end
     else
         levels(pool(A))
     end
 end
 
+_levels(A::CatArrOrSub) = _levels(pool(A))
+
 """
-    levels!(A::CategoricalArray, newlevels::Vector; allowmissing::Bool=false)
+    levels!(A::CategoricalArray, newlevels::AbstractVector; allowmissing::Bool=false)
 
 Set the levels categorical array `A`. The order of appearance of levels will be respected
 by [`levels`](@ref DataAPI.levels), which may affect display of results in some operations; if `A` is
@@ -798,7 +800,7 @@ Else, `newlevels` must include all levels which appear in the data.
 """
 function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
                  allowmissing::Bool=false) where {T, N, R}
-    (levels(A) == newlevels) && return A # nothing to do
+    (_levels(A) == newlevels) && return A # nothing to do
 
     # map each new level to its ref code
     newlv2ref = Dict{eltype(newlevels), Int}()
@@ -813,7 +815,7 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
     end
 
     # map each old ref code to new ref code (or 0 if no such level)
-    oldlevels = levels(pool(A))
+    oldlevels = _levels(pool(A))
     oldref2newref = fill(0, length(oldlevels) + 1)
     for (i, lv) in enumerate(oldlevels)
         oldref2newref[i + 1] = get(newlv2ref, lv, 0)
@@ -874,7 +876,7 @@ end
 function _uniquerefs(A::CatArrOrSub{T}) where T
     arefs = refs(A)
     res = similar(arefs, 0)
-    nlevels = length(levels(A))
+    nlevels = length(_levels(A))
     maxunique = nlevels + (T >: Missing ? 1 : 0)
     seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref)
     @inbounds for ref in arefs
@@ -907,7 +909,7 @@ returned by [`levels`](@ref DataAPI.levels)).
 """
 function droplevels!(A::CategoricalArray)
     arefs = refs(A)
-    nlevels = length(levels(A)) + 1 # +1 for missing
+    nlevels = length(_levels(A)) + 1 # +1 for missing
     seen = fill(false, nlevels)
     seen[1] = true # assume that missing is always observed to simplify checks
     nseen = 1
@@ -920,7 +922,7 @@ function droplevels!(A::CategoricalArray)
     end
 
     # replace the pool
-    A.pool = typeof(pool(A))(@inbounds(levels(A)[view(seen, 2:nlevels)]), isordered(A))
+    A.pool = typeof(pool(A))(@inbounds(_levels(A)[view(seen, 2:nlevels)]), isordered(A))
     # recode refs to keep only the seen ones (optimized version of update_refs!())
     seen[1] = false # to start levelsmap from 0
     levelsmap = cumsum(seen)
@@ -1037,7 +1039,7 @@ end
                              ordered=_isordered(A),
                              compress::Bool=false) where {T, N, R}
     # @inline is needed so that return type is inferred when compress is not provided
-    RefType = compress ? reftype(length(CategoricalArrays.levels(A))) : R
+    RefType = compress ? reftype(length(_levels(A))) : R
     CategoricalArray{T, N, RefType}(A, levels=levels, ordered=ordered)
 end
 
@@ -1050,7 +1052,7 @@ function in(x::CategoricalValue, y::CategoricalArray{T, N, R}) where {T, N, R}
     if x.pool === y.pool
         return refcode(x) in y.refs
     else
-        ref = get(y.pool, levels(x.pool)[refcode(x)], zero(R))
+        ref = get(y.pool, _levels(x.pool)[refcode(x)], zero(R))
         return ref != 0 ? ref in y.refs : false
     end
 end
diff --git a/src/pool.jl b/src/pool.jl
index 9753a76d..2df7e345 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -21,8 +21,8 @@ Base.convert(::Type{CategoricalPool{S}}, pool::CategoricalPool{T, R}) where {S,
     convert(CategoricalPool{S, R}, pool)
 
 function Base.convert(::Type{CategoricalPool{T, R}}, pool::CategoricalPool) where {T, R <: Integer}
-    if length(levels(pool)) > typemax(R)
-        throw(LevelsException{T, R}(levels(pool)[typemax(R)+1:end]))
+    if length(pool.levels) > typemax(R)
+        throw(LevelsException{T, R}(pool.levels[typemax(R)+1:end]))
     end
 
     levelsT = convert(Vector{T}, pool.levels)
@@ -37,10 +37,10 @@ Base.copy(pool::CategoricalPool{T, R}) where {T, R} =
 function Base.show(io::IO, pool::CategoricalPool{T, R}) where {T, R}
     @static if VERSION >= v"1.6.0"
         @printf(io, "%s{%s, %s}([%s])", CategoricalPool, T, R,
-                join(map(repr, levels(pool)), ", "))
+                join(map(repr, pool.levels), ", "))
     else
         @printf(io, "%s{%s,%s}([%s])", CategoricalPool, T, R,
-                join(map(repr, levels(pool)), ", "))
+                join(map(repr, pool.levels), ", "))
     end
 
     pool.ordered && print(io, " with ordered levels")
@@ -65,6 +65,7 @@ it doesn't do this itself to avoid doing a dict lookup twice
 
     i = R(n + 1)
     push!(pool.levels, x)
+    push!(pool.levelsinds, i)
     pool_hash = pool.hash
     if pool_hash !== nothing
         pool.hash = hash(x, pool_hash)
@@ -185,10 +186,10 @@ function merge_pools(a::CategoricalPool{T}, b::CategoricalPool) where {T}
         newlevs = T[]
         ordered = isordered(a)
     elseif length(a) == 0
-        newlevs = Vector{T}(levels(b))
+        newlevs = Vector{T}(b.levels)
         ordered = isordered(b)
     elseif length(b) == 0
-        newlevs = copy(levels(a))
+        newlevs = copy(a.levels)
         ordered = isordered(a)
     else
         ordered = isordered(a) && (isordered(b) || b ⊆ a)
@@ -200,7 +201,7 @@ end
 
 @inline function Base.hash(pool::CategoricalPool, h::UInt)
     if pool.hash === nothing
-        pool.hash = hashlevels(levels(pool))
+        pool.hash = hashlevels(pool.levels)
     end
     hash(pool.hash, h)
 end
@@ -246,9 +247,9 @@ end
 
 # Contrary to the CategoricalArray one, this method only allows adding new levels at the end
 # so that existing CategoricalValue objects still point to the same value
-function levels!(pool::CategoricalPool{S, R}, newlevels::Vector;
+function levels!(pool::CategoricalPool{S, R}, newlevels::AbstractVector;
                  checkunique::Bool=true) where {S, R}
-    levs = convert(Vector{S}, newlevels)
+    levs = newlevels isa CategoricalVector{S} ? newlevels : convert(Vector{S}, newlevels)
     if checkunique && !allunique(levs)
         throw(ArgumentError(string("duplicated levels found in levs: ",
                                    join(unique(filter(x->sum(levs.==x)>1, levs)), ", "))))
@@ -259,24 +260,30 @@ function levels!(pool::CategoricalPool{S, R}, newlevels::Vector;
     n = length(levs)
 
     if n > typemax(R)
-        throw(LevelsException{S, R}(setdiff(levs, levels(pool))[typemax(R)-length(levels(pool))+1:end]))
+        throw(LevelsException{S, R}(setdiff(levs, pool.levels)[typemax(R)-length(pool.levels)+1:end]))
     end
 
     empty!(pool.invindex)
     resize!(pool.levels, n)
+    resize!(pool.levelsinds, n)
     pool.hash = nothing
     pool.equalto = C_NULL
     pool.subsetof = C_NULL
     for i in 1:n
         v = levs[i]
         pool.levels[i] = v
+        pool.levelsinds[i] = i
         pool.invindex[v] = i
     end
 
     return pool
 end
 
-DataAPI.levels(pool::CategoricalPool) = pool.levels
+DataAPI.levels(pool::CategoricalPool{T}) where {T} =
+    CategoricalVector{T}(pool.levelsinds, pool)
+levels_missing(pool::CategoricalPool{T}) where {T} =
+    CategoricalVector{Union{T, Missing}}(pool.levelsinds, pool)
+_levels(pool::CategoricalPool) = pool.levels
 
 isordered(pool::CategoricalPool) = pool.ordered
 ordered!(pool::CategoricalPool, ordered) = (pool.ordered = ordered; pool)
diff --git a/src/recode.jl b/src/recode.jl
index 141f9967..ff258e60 100644
--- a/src/recode.jl
+++ b/src/recode.jl
@@ -111,7 +111,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau
     levels!(dest.pool, filter!(!ismissing, unique(vals)))
     # In the absence of duplicated recoded values, we do not need to lookup the reference
     # for each pair in the loop, which is more efficient (with loop unswitching)
-    dupvals = length(vals) != length(levels(dest.pool))
+    dupvals = length(vals) != length(_levels(dest.pool))
 
     drefs = dest.refs
     pairmap = [ismissing(v) ? zero(R) : get(dest.pool, v) for v in vals]
@@ -150,7 +150,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau
 
     # Put existing levels first, and sort them if possible
     # for consistency with CategoricalArray
-    oldlevels = setdiff(levels(dest), vals)
+    oldlevels = setdiff(_levels(dest), vals)
     filter!(!ismissing, oldlevels)
     L = eltype(oldlevels)
     if Base.OrderStyle(L) isa Base.Ordered
@@ -163,7 +163,7 @@ function _recode!(dest::CategoricalArray{T, <:Any, R}, src::AbstractArray, defau
             e isa MethodError || rethrow(e)
         end
     end
-    levels!(dest, union(oldlevels, levels(dest)))
+    levels!(dest, union(oldlevels, _levels(dest)))
 
     dest
 end
@@ -174,7 +174,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
     vals = T[p.second for p in pairs]
              
     if default === nothing
-        srclevels = levels(src)
+        srclevels = _levels(src)
 
         # Remove recoded levels as they won't appear in result
         keptlevels = Vector{T}(undef, 0)
@@ -201,7 +201,7 @@ function _recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
         ordered = false
     end
 
-    srclevels = src.pool === dest.pool ? copy(levels(src.pool)) : levels(src.pool)
+    srclevels = src.pool === dest.pool ? copy(_levels(src.pool)) : _levels(src.pool)
     if length(levs) > length(srclevels) && view(levs, 1:length(srclevels)) == srclevels
         levels!(dest.pool, levs)
     else
diff --git a/src/typedefs.jl b/src/typedefs.jl
index 0f9aa414..238bb995 100644
--- a/src/typedefs.jl
+++ b/src/typedefs.jl
@@ -8,6 +8,7 @@ const SupportedTypes = Union{AbstractString, AbstractChar, Number}
 # * `R` integer type for referencing category levels
 mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer}
     levels::Vector{T}          # category levels ordered by their reference codes
+    levelsinds::Vector{R}      # set to 1:length(levels), used by `levels(p)`
     invindex::Dict{T, R}       # map from category levels to their reference codes
     ordered::Bool              # whether levels can be compared using <
     hash::Union{UInt, Nothing} # hash of levels
@@ -45,8 +46,8 @@ mutable struct CategoricalPool{T <: SupportedTypes, R <: Integer}
                                    invindex::Dict{T, R},
                                    ordered::Bool,
                                    hash::Union{UInt, Nothing}=nothing) where {T, R}
-        pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL)
-        return pool
+        return new(levels, 1:length(levels), invindex,
+                   ordered, hash, C_NULL, C_NULL)
     end
 end
 
diff --git a/src/value.jl b/src/value.jl
index ae962adb..a1633204 100644
--- a/src/value.jl
+++ b/src/value.jl
@@ -27,6 +27,8 @@ reftype(x::Any) = reftype(typeof(x))
 pool(x::CategoricalValue) = x.pool
 refcode(x::CategoricalValue) = x.ref
 isordered(x::CategoricalValue) = isordered(x.pool)
+DataAPI.levels(x::CategoricalValue) = levels(pool(x))
+_levels(x::CategoricalValue) = _levels(pool(x))
 
 # extract the type of the original value from array eltype `T`
 unwrap_catvaluetype(::Type{T}) where {T} = T
@@ -42,7 +44,7 @@ unwrap_catvaluetype(::Type{T}) where {T <: CategoricalValue} = leveltype(T)
 
 Get the value wrapped by categorical value `x`. If `x` is `Missing` return `missing`.
 """
-DataAPI.unwrap(x::CategoricalValue) = levels(x)[refcode(x)]
+DataAPI.unwrap(x::CategoricalValue) = _levels(x)[refcode(x)]
 
 """
     levelcode(x::CategoricalValue)
@@ -59,10 +61,8 @@ Return `missing`.
 """
 levelcode(x::Missing) = missing
 
-DataAPI.levels(x::CategoricalValue) = levels(pool(x))
-
 function cat_promote_type(::Type{S}, ::Type{T}) where {S, T}
-    U = promote_type(S, T)
+    U = promote_type(unwrap_catvaluetype(S), unwrap_catvaluetype(T))
     U <: Union{SupportedTypes, Missing} ?
         U : typeintersect(Union{SupportedTypes, Missing}, Union{S, T})
 end
diff --git a/test/01_value.jl b/test/01_value.jl
index 39f58b67..8c60ae7f 100644
--- a/test/01_value.jl
+++ b/test/01_value.jl
@@ -22,6 +22,8 @@ end
     for i in 1:3
         x = CategoricalValue(pool, i)
 
+        @test levels(x) == levels(pool)
+        @test levels(x) isa CategoricalVector{String, UInt32}
         @test leveltype(x) === String
         @test leveltype(typeof(x)) === String
         @test reftype(x) === DefaultRefType
@@ -48,6 +50,8 @@ end
     for i in 1:3
         x = CategoricalValue(pool, i)
 
+        @test levels(x) == levels(pool)
+        @test levels(x) isa CategoricalVector{String, UInt8}
         @test leveltype(x) === String
         @test leveltype(typeof(x)) === String
         @test reftype(x) === UInt8
@@ -68,7 +72,8 @@ end
     for x in (CategoricalValue(pool, 1), arr, view(arr, 2:3))
         for (i, v) in enumerate(levels(pool))
             @test CategoricalValue(v, x) ===
-                CategoricalValue(float(v), x) ===
+                CategoricalValue(unwrap(v), x) ===
+                CategoricalValue(float(unwrap(v)), x) ===
                 CategoricalValue(CategoricalValue(pool, i), x) ===
                 CategoricalValue(pool, i)
         end
diff --git a/test/07_levels.jl b/test/07_levels.jl
index 25c54be0..b54e4d52 100644
--- a/test/07_levels.jl
+++ b/test/07_levels.jl
@@ -1,15 +1,16 @@
 module TestLevels
 using Test
 using CategoricalArrays
-using CategoricalArrays: DefaultRefType, levels!, hashlevels
+using CategoricalArrays: DefaultRefType, levels!, hashlevels, _levels
 
 @testset "CategoricalPool{Int} updates levels and order correctly" begin
     pool = CategoricalPool([2, 1, 3])
 
-    @test isa(levels(pool), Vector{Int})
+    @test isa(levels(pool), CategoricalVector{Int, DefaultRefType})
     @test length(pool) === 3
-    @test levels(pool) == [2, 1, 3]
-    @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .=== Ref(levels(pool)))
+    @test levels(pool) == _levels(pool) == [2, 1, 3]
+    @test all([levels(CategoricalValue(pool, i)) for i in 1:3] .== Ref(levels(pool)))
+    @test pool.levelsinds == 1:3
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3)
     @test pool.hash === nothing
     @test pool.equalto == C_NULL
@@ -20,7 +21,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
         @test isa(pool.levels, Vector{Int})
         @test length(pool) === 4
-        @test levels(pool) == [2, 1, 3, 4]
+        @test levels(pool) == _levels(pool) == [2, 1, 3, 4]
+        @test pool.levelsinds == 1:4
         @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4)
         @test pool.hash === nothing
         @test pool.equalto == C_NULL
@@ -34,7 +36,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
         @test isa(pool.levels, Vector{Int})
         @test length(pool) === 5
-        @test levels(pool) == [2, 1, 3, 4, 0]
+        @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0]
+        @test pool.levelsinds == 1:5
         @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5)
         @test pool.hash === nothing
         @test pool.equalto == C_NULL
@@ -48,7 +51,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
         @test isa(pool.levels, Vector{Int})
         @test length(pool) === 7
-        @test levels(pool) == [2, 1, 3, 4, 0, 10, 11]
+        @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11]
+        @test pool.levelsinds == 1:7
         @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7)
         @test pool.hash === nothing
         @test pool.equalto == C_NULL
@@ -64,7 +68,8 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
         @test isa(pool.levels, Vector{Int})
         @test length(pool) === 9
-        @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13]
+        @test levels(pool) == _levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13]
+        @test pool.levelsinds == 1:9
         @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9)
         @test pool.hash === nothing
         @test pool.equalto == C_NULL
@@ -84,15 +89,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
     # Adding levels while preserving existing ones
     levs = [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14]
     @test levels!(pool, levs) === pool
-    @test levels(pool) == levs
-    @test levels(pool) !== levs
-    @test pool.hash === nothing
-    @test pool.equalto == C_NULL
-    @test pool.subsetof == C_NULL
-
+    @test levels(pool) == _levels(pool) == levs
+    @test pool.levels !== levs
     @test isa(pool.levels, Vector{Int})
-    @test length(pool) === 11
-    @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14]
+    @test pool.levelsinds == 1:11
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
                                 15=>10, 14=>11)
     @test pool.hash === nothing
@@ -109,7 +109,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
     @test isa(pool.levels, Vector{Int})
     @test length(pool) == 12
-    @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20]
+    @test levels(pool) ==
+        _levels(pool) ==
+        [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20]
+    @test pool.levelsinds == 1:12
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
                                 15=>10, 14=>11, 20=>12)
     @test pool.hash === nothing
@@ -128,7 +131,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
     @test isa(pool.levels, Vector{Int})
     @test length(pool) == 14
-    @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test levels(pool) ==
+        _levels(pool) ==
+        [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test pool.levelsinds == 1:14
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
                                 15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
     @test pool.hash === nothing
@@ -143,7 +149,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
     @test isa(pool.levels, Vector{Int})
     @test length(pool) == 14
-    @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test levels(pool) ==
+        _levels(pool) ==
+        [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test pool.levelsinds == 1:14
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
                                 15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
     @test pool.hash === CategoricalArrays.hashlevels(levels(pool))
@@ -155,7 +164,10 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
 
     @test isa(pool.levels, Vector{Int})
     @test length(pool) == 14
-    @test levels(pool) == [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test levels(pool) ==
+        _levels(pool) ==
+        [2, 1, 3, 4, 0, 10, 11, 12, 13, 15, 14, 20, 100, 99]
+    @test pool.levelsinds == 1:14
     @test pool.invindex == Dict(2=>1, 1=>2, 3=>3, 4=>4, 0=>5, 10=>6, 11=>7, 12=>8, 13=>9,
                                 15=>10, 14=>11, 20=>12, 100=>13, 99=>14)
     @test pool.hash === CategoricalArrays.hashlevels(levels(pool))
@@ -178,6 +190,22 @@ using CategoricalArrays: DefaultRefType, levels!, hashlevels
     @test !isordered(p2)
 end
 
+@testset "levels!(::CategoricalPool, ::CategoricalVector)" begin
+    pool = CategoricalPool([2, 1, 3])
+    levels!(pool, categorical([2, 1, 3, 4]))
+    @test levels(pool) == [2, 1, 3, 4]
+
+    pool = CategoricalPool([2, 1, 3])
+    levels!(pool, categorical([2.0, 1.0, 3.0, 4.0]))
+    @test levels(pool) == [2, 1, 3, 4]
+
+    pool = CategoricalPool([2, 1, 3])
+    @test_throws ArgumentError levels!(pool, categorical([2, 2, 1, 3, 4]))
+
+    pool = CategoricalPool([2, 1, 3])
+    @test_throws ArgumentError levels!(pool, categorical(1:3))
+end
+
 @testset "overflow of reftype is detected and doesn't corrupt levels" begin
     res = @test_throws LevelsException{Int, UInt8} CategoricalPool{Int, UInt8}(collect(256:-1:1))
     @test res.value.levels == [1]
diff --git a/test/11_array.jl b/test/11_array.jl
index b474cfe1..4f332640 100644
--- a/test/11_array.jl
+++ b/test/11_array.jl
@@ -746,7 +746,7 @@ end
     @test y == unique(x)
 
     x = CategoricalArray(String[])
-    @test isa(levels(x), Vector{String}) && isempty(levels(x))
+    @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x))
     @test isa(unique(x), typeof(x)) && isempty(unique(x))
     @test levels!(x, ["Young", "Middle", "Old"]) === x
     @test levels(x) == ["Young", "Middle", "Old"]
diff --git a/test/12_missingarray.jl b/test/12_missingarray.jl
index a2204e40..5c2ed3a9 100644
--- a/test/12_missingarray.jl
+++ b/test/12_missingarray.jl
@@ -1160,7 +1160,7 @@ end
     @test unique(x) ≅ ["Old", "Young", "Middle", missing]
 
     x = CategoricalArray((Union{String, Missing})[missing])
-    @test isa(levels(x), Vector{String}) && isempty(levels(x))
+    @test isa(levels(x), CategoricalVector{String, DefaultRefType}) && isempty(levels(x))
     @test unique(x) ≅ [missing]
     @test levels!(x, ["Young", "Middle", "Old"]) === x
     @test levels(x) == ["Young", "Middle", "Old"]
diff --git a/test/13_arraycommon.jl b/test/13_arraycommon.jl
index 2cf3ff6b..e95be673 100644
--- a/test/13_arraycommon.jl
+++ b/test/13_arraycommon.jl
@@ -2424,18 +2424,18 @@ end
               view(categorical(Union{String, Missing}[missing, "b", "a"], levels=["b", "c", "a"]), 2:3))
         @test @inferred(levels(x)) == ["b", "c", "a"]
         @test levels(x, skipmissing=true) == ["b", "c", "a"]
-        @test levels(x, skipmissing=true) isa Vector{String}
+        @test levels(x, skipmissing=true) isa CategoricalVector{String}
         @test levels(x, skipmissing=false) == ["b", "c", "a"]
-        @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}}
+        @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}}
     end
 
     for x in (categorical(Union{String, Missing}["a", "b", missing], levels=["b", "c", "a"]),
               view(categorical(Union{String, Missing}["c", "b", missing], levels=["b", "c", "a"]), 2:3))
         @test @inferred(levels(x)) == ["b", "c", "a"]
         @test levels(x, skipmissing=true) == ["b", "c", "a"]
-        @test levels(x, skipmissing=true) isa Vector{String}
+        @test levels(x, skipmissing=true) isa CategoricalVector{String}
         @test levels(x, skipmissing=false) ≅ ["b", "c", "a", missing]
-        @test levels(x, skipmissing=false) isa Vector{Union{String, Missing}}
+        @test levels(x, skipmissing=false) isa CategoricalVector{Union{String, Missing}}
     end
 end
 
diff --git a/test/14_view.jl b/test/14_view.jl
index 79b20812..11853853 100644
--- a/test/14_view.jl
+++ b/test/14_view.jl
@@ -11,7 +11,8 @@ const ≅ = isequal
 
     x = CategoricalArray{Union{T, eltype(a)}}(a, ordered=order)
     v = view(x, inds)
-    @test levels(v) === levels(x)
+    @test levels(x) isa CategoricalVector{nonmissingtype(eltype(a))}
+    @test levels(v) == levels(x)
     @test unique(v) == (ndims(v) > 0 ? unique(a[inds]) : [a[inds]])
     @test isordered(v) === isordered(x)
 end

From 49e200af3edea6fc528feb10a8a3a88f8b88f79d Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Thu, 31 Jul 2025 12:42:10 +0200
Subject: [PATCH 25/25] Release version 1.0 (#426)

---
 NEWS.md      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 Project.toml |  2 +-
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 NEWS.md

diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 00000000..61842496
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,48 @@
+# CategoricalArrays.jl v1.0.0 Release Notes
+
+## Breaking changes
+
+* `unique(::CategoricalArray)` and `levels(::CategoricalArray)` return
+  a `CategoricalArray` instead of unwrapping values, consistent with
+  `unique(::AbstractArray)` in Base and `levels(::AbstractArray)` in DataAPI
+  ([#358](https://github.com/JuliaData/CategoricalArrays.jl/pull/358),
+  [#425](https://github.com/JuliaData/CategoricalArrays.jl/pull/425)).
+
+* `cut` always closes the last interval on the right
+  ([#409](https://github.com/JuliaData/CategoricalArrays.jl/pull/409)).
+
+* `cut(x, breaks)` rounds breaks to generate shorter labels
+  ([#422](https://github.com/JuliaData/CategoricalArrays.jl/pull/422)).
+
+* `cut(x, ngroups)` takes breaks from actual values instead of using
+  quantile estimates, which are generally longer
+  ([#416](https://github.com/JuliaData/CategoricalArrays.jl/pull/416)).
+  This only changes group labels, not their contents.
+
+* `T(::CategoricalArray{U})` and `convert(T, ::CategoricalArray{U})`
+  now consistently return an `Array{U}` for `T` in `Array`, `Vector`, `Matrix`.
+  This avoids creating `Array{<:CategoricalValue}` objects unless explicitly requested
+  ([#420](https://github.com/JuliaData/CategoricalArrays.jl/pull/420)).
+
+
+* All deprecations have been removed
+  ([#419](https://github.com/JuliaData/CategoricalArrays.jl/pull/419)).
+
+## New features
+
+* Support reading from and writing to Arrow files
+  ([#415](https://github.com/JuliaData/CategoricalArrays.jl/pull/415)).
+
+* Improve performance of `recode`
+  ([#407](https://github.com/JuliaData/CategoricalArrays.jl/pull/407)).
+
+* Support weighted quantiles in `cut`
+  ([#423](https://github.com/JuliaData/CategoricalArrays.jl/pull/423)).
+
+## Bug fixes
+
+* Fix performance regression on Julia 1.11 and above
+  ([#418](https://github.com/JuliaData/CategoricalArrays.jl/pull/418)).
+
+* Fix `cut` corner cases with duplicated breaks
+  ([#410](https://github.com/JuliaData/CategoricalArrays.jl/pull/410)).
diff --git a/Project.toml b/Project.toml
index a9262e93..83d5ba30 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "CategoricalArrays"
 uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
-version = "0.10.8"
+version = "1.0.0"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"