struct Chunks
    ctx::Context
    Chunks(ctx::Context) = new(ctx)
end
"""
CSV.Chunks(source; ntasks::Integer=Threads.nthreads(), kwargs...) => CSV.Chunks
Returns a file "chunk" iterator. Accepts all the same inputs and keyword arguments as [`CSV.File`](@ref),
see those docs for explanations of each keyword argument.
The `ntasks` keyword argument specifies how many chunks a file should be split up into, defaulting to
the # of threads available to Julia (i.e. `JULIA_NUM_THREADS` environment variable) or 8 if Julia is
run single-threaded.
Each iteration of `CSV.Chunks` produces the next chunk of a file as a `CSV.File`. While initial file
metadata detection is done only once (to determine # of columns, column names, etc), each iteration
does independent type inference on columns. This is significant as different chunks may end up with
different column types than previous chunks as new values are encountered in the file. Note that, as
with `CSV.File`, types may be passed manually via the `type` or `types` keyword arguments.
This functionality is new and thus considered experimental; please
[open an issue](https://github.com/JuliaData/CSV.jl/issues/new) if you run into any problems/bugs.
$KEYWORD_DOCS
"""
function Chunks(source::ValidSources;
    # file options
    # header can be a row number, range of rows, or actual string vector
    header::Union{Integer, Vector{Symbol}, Vector{String}, AbstractVector{<:Integer}}=1,
    normalizenames::Bool=false,
    # by default, data starts immediately after header or start of file
    datarow::Integer=-1,
    skipto::Integer=-1,
    footerskip::Integer=0,
    transpose::Bool=false,
    comment::Union{String, Nothing}=nothing,
    ignoreemptyrows::Bool=true,
    ignoreemptylines=nothing,
    select=nothing,
    drop=nothing,
    limit::Union{Integer, Nothing}=nothing,
    buffer_in_memory::Bool=false,
    ntasks::Union{Integer, Nothing}=Threads.nthreads() == 1 ? 8 : Threads.nthreads(),
    tasks::Union{Nothing, Integer}=nothing,
    rows_to_check::Integer=DEFAULT_ROWS_TO_CHECK,
    lines_to_check=nothing,
    # parsing options
    missingstrings=String[],
    missingstring="",
    delim::Union{Nothing, Char, String}=nothing,
    ignorerepeated::Bool=false,
    quoted::Bool=true,
    quotechar::Union{UInt8, Char}='"',
    openquotechar::Union{UInt8, Char, Nothing}=nothing,
    closequotechar::Union{UInt8, Char, Nothing}=nothing,
    escapechar::Union{UInt8, Char}='"',
    dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
    dateformats=nothing,
    decimal::Union{UInt8, Char}=UInt8('.'),
    groupmark::Union{Char, Nothing}=nothing,
    truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
    falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
    stripwhitespace::Bool=false,
    # type options
    type=nothing,
    types=nothing,
    typemap::AbstractDict=IdDict{Type, Type}(),
    pool::Union{Bool, Real, AbstractVector, AbstractDict, Base.Callable, Tuple}=DEFAULT_POOL,
    downcast::Bool=false,
    lazystrings::Bool=false,
    stringtype::StringTypes=DEFAULT_STRINGTYPE,
    strict::Bool=false,
    silencewarnings::Bool=false,
    maxwarnings::Int=DEFAULT_MAX_WARNINGS,
    debug::Bool=false,
    parsingdebug::Bool=false,
    validate=true,
    )
    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, groupmark, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
    !ctx.threaded && throw(ArgumentError("unable to iterate chunks from input file source"))
    foreach(col -> col.lock = ReentrantLock(), ctx.columns)
    return Chunks(ctx)
end
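
# Usage sketch (illustrative only, not part of the library; the file name and
# `types` mapping below are hypothetical). Each iteration yields one chunk as a
# `CSV.File`; since each chunk re-infers column types independently, passing
# `types` explicitly is one way to keep column types consistent across chunks:
#
#     for chunk in CSV.Chunks("large_file.csv"; ntasks=4, types=Dict(:id => Int64))
#         # `chunk` is a CSV.File covering one section of the file
#         for row in chunk
#             # process row...
#         end
#     end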
Base.IteratorSize(::Type{Chunks}) = Base.HasLength()
Base.length(x::Chunks) = x.ctx.ntasks
Base.eltype(x::Chunks) = File
function Base.iterate(x::Chunks, i=1)
    i > x.ctx.ntasks && return nothing
    # fresh copies of names/columns so this chunk's type inference is independent
    names = copy(x.ctx.names)
    columns = [Column(col) for col in x.ctx.columns]
    # byte range of this chunk within the shared buffer
    datapos = x.ctx.chunkpositions[i]
    len = x.ctx.chunkpositions[i + 1] - 1
    rowsguess = cld(x.ctx.rowsguess, x.ctx.ntasks)
    # each chunk is parsed single-threaded as its own CSV.File
    threaded = false
    ntasks = 1
    limit = typemax(Int)
    ctx = Context(x.ctx.transpose, x.ctx.name, names, rowsguess, x.ctx.cols, x.ctx.buf, datapos, len, 1, x.ctx.options, columns, x.ctx.pool, x.ctx.downcast, x.ctx.customtypes, x.ctx.typemap, x.ctx.stringtype, limit, threaded, ntasks, x.ctx.chunkpositions, x.ctx.strict, x.ctx.silencewarnings, x.ctx.maxwarnings, x.ctx.debug, x.ctx.tempfile, x.ctx.streaming, x.ctx.types)
    f = File(ctx, true)
    return f, i + 1
end
Tables.partitions(x::Chunks) = x
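
# Because `Tables.partitions(x::Chunks) = x`, a `CSV.Chunks` can be handed to any
# sink that consumes partitioned tables. A minimal sketch (assumes Arrow.jl is
# installed; the file names are hypothetical), streaming a large CSV to disk one
# chunk at a time without materializing the whole file:
#
#     using Arrow
#     Arrow.write("big.arrow", CSV.Chunks("big.csv"; ntasks=8))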