diff --git a/Project.toml b/Project.toml index d4108dd..0b90562 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "PooledArrays" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.4.1" +version = "1.4.2" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" @@ -12,6 +12,7 @@ julia = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" [targets] -test = ["Test"] +test = ["OffsetArrays", "Test"] diff --git a/README.md b/README.md index 5c88703..df6e2c8 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,67 @@ A pooled representation of arrays for purposes of compression when there are few unique elements. - **Installation**: at the Julia REPL, `import Pkg; Pkg.add("PooledArrays")` -**Maintenance**: PooledArrays is maintained collectively by the [JuliaData collaborators](https://github.com/orgs/JuliaData/people). -Responsiveness to pull requests and issues can vary, depending on the availability of key collaborators. +**Usage**: + +Working with `PooledArray` objects does not differ from working with general +`AbstractArray` objects, with two exceptions: +* If you hold mutable objects in a `PooledArray`, you must not modify them + after they are stored in it. +* In a multi-threaded context, it is not safe to assign values that are not already + present in a `PooledArray`'s pool from one thread while either reading or + writing to the same array from another thread. + +Keeping these two restrictions in mind, as a user, the only thing you need to +learn is how to create `PooledArray` objects. This is accomplished by passing +an `AbstractArray` to the `PooledArray` constructor: + +``` +julia> using PooledArrays + +julia> PooledArray(["a" "b"; "c" "d"]) +2×2 PooledMatrix{String, UInt32, Matrix{UInt32}}: + "a" "b" + "c" "d" + ``` + +`PooledArray` performs compression by storing an array of reference integers and +a mapping from integers to its elements in a dictionary. 
In this way, if the +size of the reference integer is smaller than the size of the actual elements, +the resulting `PooledArray` has a smaller memory footprint than the equivalent +`Array`. By default, `UInt32` is used as the type of reference integers. However, +you can specify the reference integer type you want to use by passing it as a +second argument to the constructor. This is usually done when you know that you +will have only a few unique elements in the `PooledArray`. + +``` +julia> PooledArray(["a", "b", "c", "d"], UInt8) +4-element PooledVector{String, UInt8, Vector{UInt8}}: + "a" + "b" + "c" + "d" + ``` + +Alternatively, you can pass the `compress` and `signed` keyword arguments to the +`PooledArray` constructor to automatically select the reference integer type. +When you pass `compress=true`, the reference integer type is chosen to be +the smallest type that is large enough to hold all unique values in the array. When +you pass `signed=true`, the reference type is signed (by default it is unsigned). +``` +julia> PooledArray(["a", "b", "c", "d"]; compress=true, signed=true) +4-element PooledVector{String, Int8, Vector{Int8}}: + "a" + "b" + "c" + "d" +``` + +**Maintenance**: PooledArrays is maintained collectively by the +[JuliaData collaborators](https://github.com/orgs/JuliaData/people). +Responsiveness to pull requests and issues can vary, +depending on the availability of key collaborators. 
## Related Packages diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 2f4d0fe..7daf2b9 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -114,20 +114,25 @@ function _label(xs::AbstractArray, ) where {T, I<:Integer} @inbounds for i in start:length(xs) - x = xs[i] - lbl = get(invpool, x, zero(I)) - if lbl !== zero(I) - labels[i] = lbl + idx = i + firstindex(xs) - 1 + if !isassigned(xs, idx) + labels[i] = zero(I) else - if nlabels == typemax(I) - I2 = _widen(I) - return _label(xs, T, I2, i, convert(Vector{I2}, labels), - convert(Dict{T, I2}, invpool), pool, nlabels) + x = xs[idx] + lbl = get(invpool, x, zero(I)) + if lbl !== zero(I) + labels[i] = lbl + else + if nlabels == typemax(I) + I2 = _widen(I) + return _label(xs, T, I2, i, convert(Vector{I2}, labels), + convert(Dict{T, I2}, invpool), pool, nlabels) + end + nlabels += 1 + labels[i] = nlabels + invpool[x] = nlabels + push!(pool, x) end - nlabels += 1 - labels[i] = nlabels - invpool[x] = nlabels - push!(pool, x) end end labels, invpool, pool @@ -613,14 +618,14 @@ function Base.append!(pv::PooledVector, items::AbstractArray) return pv end -Base.pop!(pv::PooledVector) = pv.invpool[pop!(pv.refs)] +Base.pop!(pv::PooledVector) = pv.pool[pop!(pv.refs)] function Base.pushfirst!(pv::PooledVector{S,R}, v::T) where {S,R,T} pushfirst!(pv.refs, getpoolidx(pv, v)) return pv end -Base.popfirst!(pv::PooledVector) = pv.invpool[popfirst!(pv.refs)] +Base.popfirst!(pv::PooledVector) = pv.pool[popfirst!(pv.refs)] Base.empty!(pv::PooledVector) = (empty!(pv.refs); pv) diff --git a/test/runtests.jl b/test/runtests.jl index 5b36073..105a708 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,4 @@ -using Test +using Test, OffsetArrays using PooledArrays using DataAPI: refarray, refvalue, refpool, invrefpool using PooledArrays: refcount @@ -588,3 +588,38 @@ end @test_throws BoundsError insert!(x, 9, true) @test x == [1, 1, 10, 99, 2, 3, 1] end + +@testset "pop! and popfirst!" 
begin + x = PooledArray([1, 2, 3]) + @test pop!(x) == 3 + @test x == [1, 2] + @test popfirst!(x) == 1 + @test x == [2] + x = PooledArray(["1", "2", "3"]) + @test pop!(x) == "3" + @test x == ["1", "2"] + @test popfirst!(x) == "1" + @test x == ["2"] +end + +@testset "constructor corner cases" begin + x = Vector{Any}(undef, 3) + y = PooledArray(x) + @test y isa PooledArray{Any} + @test !any(i -> isassigned(y, i), eachindex(y)) + @test all(iszero, y.refs) + @test isempty(y.pool) + @test isempty(y.invpool) + + x[2] = "a" + for v in (x, OffsetVector(x, -5)) + y = PooledArray(v) + @test y isa PooledArray{Any} + @test !isassigned(x, 1) + @test x[2] == "a" + @test !isassigned(x, 3) + @test y.refs == [0, 1, 0] + @test y.pool == ["a"] + @test y.invpool == Dict("a" => 1) + end +end