diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..778c06f --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,14 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..e883daf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,67 @@ +name: CI +on: + pull_request: + branches: + - main + push: + branches: + - main + tags: '*' +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.0' + - '1' # automatically expands to the latest stable 1.x release of Julia + - 'nightly' + os: + - ubuntu-latest + arch: + - x64 + include: + - os: windows-latest + version: '1' + arch: x86 + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: actions/cache@v1 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v1 + with: + file: lcov.info + # docs: + # name: Documentation + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v2 + # - uses: julia-actions/setup-julia@v1 + # with: + # version: '1' + # - run: | + # julia --project=docs -e ' + # using Pkg 
+ # Pkg.develop(PackageSpec(path=pwd())) + # Pkg.instantiate()' + # - run: julia --project=docs docs/make.jl + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f703d29..0000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -# Documentation: http://docs.travis-ci.com/user/languages/julia/ -language: julia - -os: - - linux - - osx - -julia: - - "1.0" - - "1.1" - - "nightly" - -matrix: - allow_failures: - - julia: "nightly" - fast_finish: true - -notifications: - email: false - -after_success: - - julia -e 'using Pkg; cd(Pkg.dir("PooledArrays")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder()); Codecov.submit(Codecov.process_folder())' diff --git a/Project.toml b/Project.toml index 3b262e8..1b6c1f7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "PooledArrays" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "0.5.3" +version = "1.0.0" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/README.md b/README.md index 676ae64..5c88703 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,20 @@ -| Build | Test Coverage | -|-------|---------------| -| [![Build Status](https://travis-ci.org/JuliaData/PooledArrays.jl.svg?branch=master)](https://travis-ci.org/JuliaData/PooledArrays.jl) | [![codecov](https://codecov.io/gh/JuliaData/PooledArrays.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/PooledArrays.jl) | - # PooledArrays.jl +[![CI](https://github.com/JuliaData/PooledArrays.jl/workflows/CI/badge.svg)](https://github.com/JuliaData/PooledArrays.jl/actions?query=workflow%3ACI) +[![codecov](https://codecov.io/gh/JuliaData/PooledArrays.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaData/PooledArrays.jl) +[![deps](https://juliahub.com/docs/PooledArrays/deps.svg)](https://juliahub.com/ui/Packages/PooledArrays/vi11X?t=2) 
+[![version](https://juliahub.com/docs/PooledArrays/version.svg)](https://juliahub.com/ui/Packages/PooledArrays/vi11X) +[![pkgeval](https://juliahub.com/docs/PooledArrays/pkgeval.svg)](https://juliahub.com/ui/Packages/PooledArrays/vi11X) + + A pooled representation of arrays for purposes of compression when there are few unique elements. + +**Installation**: at the Julia REPL, `import Pkg; Pkg.add("PooledArrays")` + +**Maintenance**: PooledArrays is maintained collectively by the [JuliaData collaborators](https://github.com/orgs/JuliaData/people). +Responsiveness to pull requests and issues can vary, depending on the availability of key collaborators. + ## Related Packages - [IndirectArrays](https://github.com/JuliaArrays/IndirectArrays.jl) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 671649f..9479039 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -11,6 +11,7 @@ export PooledArray, PooledVector, PooledMatrix ############################################################################## const DEFAULT_POOLED_REF_TYPE = UInt32 +const DEFAULT_SIGNED_REF_TYPE = Int32 # This is used as a wrapper during PooledArray construction only, to distinguish # arrays of pool indices from normal arrays @@ -67,7 +68,7 @@ PooledArray(d::PooledArray) = copy(d) function _label(xs::AbstractArray, ::Type{T}=eltype(xs), - ::Type{I}=UInt8, + ::Type{I}=DEFAULT_POOLED_REF_TYPE, start = 1, labels = Array{I}(undef, size(xs)), invpool::Dict{T,I} = Dict{T, I}(), @@ -87,8 +88,8 @@ function _label(xs::AbstractArray, convert(Dict{T, I2}, invpool), pool, nlabels) end nlabels += 1 - labels[i] = convert(I, nlabels) - invpool[x] = convert(I, nlabels) + labels[i] = nlabels + invpool[x] = nlabels push!(pool, x) end end @@ -98,15 +99,25 @@ end _widen(::Type{UInt8}) = UInt16 _widen(::Type{UInt16}) = UInt32 _widen(::Type{UInt32}) = UInt64 - +_widen(::Type{Int8}) = Int16 +_widen(::Type{Int16}) = Int32 +_widen(::Type{Int32}) = Int64 # Constructor from array, invpool, and ref type 
""" - PooledArray(array, [reftype]) - -Convert the given array to a PooledArray where each element will be referenced -as an integer of the given type. If no `reftype` is specified one is chosen -automatically based on the number of unique elements. + PooledArray(array, [reftype]; signed=false, compress=false) + +Freshly allocate a `PooledArray` using the given array as a source where each +element will be referenced as an integer of the given type. +If no `reftype` is specified one is chosen automatically based on the number of unique elements. +The Boolean keyword arguments `signed` and `compress` determine the choice of `reftype`. +By default, unsigned integers are used, as they have a greater `typemax` than a signed +integer of the same size. However, the Arrow standard at https://arrow.apache.org/, as implemented in +the Arrow package, requires signed integer types, which are provided when `signed` is `true`. +The `compress` argument controls whether the default size of 32 bits is used (`UInt32` for +unsigned, `Int32` for signed) or if smaller integer types are chosen when they can be used. +If `array` is not a `PooledArray` then the order of elements in `refpool` in the resulting +`PooledArray` is the order of first appearance of elements in `array`. """ PooledArray @@ -121,13 +132,16 @@ function PooledArray{T}(d::AbstractArray, r::Type{R}) where {T,R<:Integer} PooledArray(RefArray(refs::Vector{R}), invpool::Dict{T,R}, pool) end -function PooledArray{T}(d::AbstractArray) where T - refs, invpool, pool = _label(d, T) +function PooledArray{T}(d::AbstractArray; signed::Bool=false, compress::Bool=false) where {T} + R = signed ? (compress ? Int8 : DEFAULT_SIGNED_REF_TYPE) : (compress ? 
UInt8 : DEFAULT_POOLED_REF_TYPE) + refs, invpool, pool = _label(d, T, R) PooledArray(RefArray(refs), invpool, pool) end PooledArray(d::AbstractArray{T}, r::Type) where {T} = PooledArray{T}(d, r) -PooledArray(d::AbstractArray{T}) where {T} = PooledArray{T}(d) +function PooledArray(d::AbstractArray{T}; signed::Bool=false, compress::Bool=false) where {T} + PooledArray{T}(d, signed=signed, compress=compress) +end # Construct an empty PooledVector of a specific type PooledArray(t::Type) = PooledArray(Array(t,0)) @@ -376,9 +390,8 @@ end ############################################################################## function Base.push!(pv::PooledVector{S,R}, v::T) where {S,R,T} - v = convert(S,v) push!(pv.refs, getpoolidx(pv, v)) - return v + return pv end function Base.append!(pv::PooledVector, items::AbstractArray) @@ -393,9 +406,8 @@ end Base.pop!(pv::PooledVector) = pv.invpool[pop!(pv.refs)] function Base.pushfirst!(pv::PooledVector{S,R}, v::T) where {S,R,T} - v = convert(S,v) pushfirst!(pv.refs, getpoolidx(pv, v)) - return v + return pv end Base.popfirst!(pv::PooledVector) = pv.invpool[popfirst!(pv.refs)] @@ -445,7 +457,10 @@ function Base.vcat(a::PooledArray{T, <:Integer, 1}, b::PooledArray{S, <:Integer, return PooledArray(RefArray(newrefs), convert(Dict{U, refT}, newlabels)) end -function fast_sortable(y::PooledArray) +fast_sortable(y::PooledArray) = _fast_sortable(y) +fast_sortable(y::PooledArray{T}) where {T<:Integer} = isbitstype(T) ? 
y : _fast_sortable(y) + +function _fast_sortable(y::PooledArray) poolranks = invperm(sortperm(y.pool)) newpool = Dict(j=>convert(eltype(y.refs), i) for (i,j) in enumerate(poolranks)) PooledArray(RefArray(y.refs), newpool) diff --git a/test/runtests.jl b/test/runtests.jl index 4687332..a503773 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,11 @@ using Test using PooledArrays using DataAPI: refarray, refvalue, refpool -let a = rand(10), b = rand(10,10), c = rand(1:10, 1000) +@testset "PooledArrays" begin + a = rand(10) + b = rand(10,10) + c = rand(1:10, 1000) + @test PooledArray(a) == a @test PooledArray(b) == b pc = PooledArray(c) @@ -65,15 +69,50 @@ let a = rand(10), b = rand(10,10), c = rand(1:10, 1000) @test PooledArray{Union{Int,Missing}}([1, 2]) isa PooledArray{Union{Int,Missing}} - @test eltype(PooledArray(rand(128)).refs) == UInt8 - @test eltype(PooledArray(rand(300)).refs) == UInt16 + @test eltype(PooledArray(rand(128)).refs) == UInt32 + @test eltype(PooledArray(rand(300)).refs) == UInt32 + @test eltype(PooledArray(rand(128), UInt8).refs) == UInt8 + @test eltype(PooledArray(rand(300), UInt16).refs) == UInt16 @test PooledVector == PooledArray{T, R, 1} where {T, R} @test PooledMatrix == PooledArray{T, R, 2} where {T, R} s = PooledArray(["a", "a", "b"]) + @test eltype(PooledArray(s).refs) == UInt32 + @test eltype(PooledArray(s, signed=true).refs) == Int32 + @test eltype(PooledArray(s, compress=true).refs) == UInt8 + @test eltype(PooledArray(s, signed=true, compress=true).refs) == Int8 + @test eltype(PooledArray(rand(300), signed=true, compress=true).refs) == Int16 @test all(refarray(s) .== [1, 1, 2]) for i in 1:3 @test refvalue(s, refarray(s)[i]) == s[i] end @test refpool(s) == ["a", "b"] + + @testset "push!" begin + xs = PooledArray([10, 20, 30]) + @test xs === push!(xs, -100) + @test xs == [10, 20, 30, -100] + end + + @testset "pushfirst!" 
begin + ys = PooledArray([10, 20, 30]) + @test ys === pushfirst!(ys, -100) + @test ys == [-100, 10, 20, 30] + end + + v1 = PooledArray([1, 3, 2, 4]) + v2 = PooledArray(BigInt.([1, 3, 2, 4])) + v3 = PooledArray(["a", "c", "b", "d"]) + + @test PooledArrays.fast_sortable(v1) === v1 + @test isbitstype(eltype(PooledArrays.fast_sortable(v1))) + @test Base.Order.Perm(Base.Order.Forward, v1).data === v1 + + @test PooledArrays.fast_sortable(v2) == PooledArray([1, 3, 2, 4]) + @test isbitstype(eltype(PooledArrays.fast_sortable(v2))) + @test Base.Order.Perm(Base.Order.Forward, v2).data == PooledArray([1, 3, 2, 4]) + + @test PooledArrays.fast_sortable(v3) == PooledArray([1, 3, 2, 4]) + @test isbitstype(eltype(PooledArrays.fast_sortable(v3))) + @test Base.Order.Perm(Base.Order.Forward, v3).data == PooledArray([1, 3, 2, 4]) end