diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Manifest.toml b/Manifest.toml index c704ecc..cf7317f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,136 +1,5 @@ # This file is machine-generated - editing it directly is not advised -[[Arpack]] -deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] -git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" -uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -version = "0.4.0" - -[[Arpack_jll]] -deps = ["Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" -uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.0+2" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" - -[[DataAPI]] -git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.1.0" - -[[DataStructures]] -deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.10" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.22.4" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.5" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[LibGit2]] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.3" - -[[OpenBLAS_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" - -[[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" - -[[OrderedCollections]] -deps = ["Random", "Serialization", "Test"] -git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.1.0" - -[[PDMats]] -deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] -git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.9.11" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.3.1" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" - [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -160,12 +29,6 @@ version = "0.10.0" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" - [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" @@ -186,3 +49,39 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+8" + +[[libass_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "3fd3ea3525f2e3d337c54a52b2ca78a5a272bbf5" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+0" + +[[libfdk_aac_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "0e4ace600c20714a8dd67700c4502714d8473e8e" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+1" + +[[libvorbis_jll]] +deps = ["Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "71e54fb89ac3e0344c7185d1876fd96b0f246952" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+2" + +[[x264_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "23664c0757c3740050ca0e22944c786c165ca25a" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2019.5.25+1" + +[[x265_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9345e417084421a8e91373d6196bc58e660eed2a" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+0" diff --git a/Project.toml b/Project.toml index 65ba1d2..a3f7a69 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,12 @@ authors = ["Jonathan Chang "] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] julia = "1.3" diff --git a/examples/LDA.jl b/examples/LDA.jl index 16f7183..32027fc 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,14 +1,46 @@ -using TopicModels +using TopicModels, Plots, UMAP -exdir = Pkg.dir("TopicModels", "examples") +################################################################################################################################## +# Fit and Visualize Real-World Text Data -testDocuments = readDocuments(open(joinpath(exdir, "cora.documents"))) +exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") + +testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) 
testLexicon = readLexicon(open(joinpath(exdir, "cora.lexicon"))) -corpus = Corpus(testDocuments) +corpus = Corpus(testDocuments,testLexicon) +model = Model(fill(0.1, 10), fill(0.01,length(testLexicon)), corpus) +state = State(model,corpus) + +#@time Juno.@run trainModel(model, state, 30) +@time trainModel(model, state, 30) +topWords = topTopicWords(model, state, 10) + +# visualize the fit +@time embedding = umap(state.topics, 2, n_neighbors=10) +maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2])) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max-Likelihood Doc Topics on Learned", marker=(2, 2, :auto, stroke(0))) + +################################################################################################################################## +# Fit, Validate, and Visualize Synthetic Data Derived from a Fully-Generative Simulation (Poisson-distributed document length) + +k = 10 +lexLength = 1000 +corpLambda = 1000 # Poisson parameter for random doc length +corpLength = 100 +scaleK = 0.01 +scaleL = 0.01 +testCorpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL) -model = Model(fill(0.1, 10), 0.01, length(testLexicon), corpus) +testModel = Model(testCorpus.alpha, testCorpus.beta, testCorpus) +testState = State(testModel, testCorpus) +@time trainModel(testModel, testState, 100) -@time trainModel(model, 30) +# compute validation metrics on a single fit +CorpusARI(testState,testModel,testCorpus) # ARI for max-likelihood document topics +DocsARI(testState,testCorpus) # ARI for actual word topics -topWords = topTopicWords(model, testLexicon, 21) +# visualize the fit +@time embedding = umap(testState.topics, 2; n_neighbors=10) +maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2])) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0))) diff --git a/src/Computation.jl b/src/Computation.jl new file mode 100644 index 0000000..13c8266 --- /dev/null +++ b/src/Computation.jl @@ -0,0 +1,251 @@ +struct Model + alphaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on document topics + betaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on words + corpus::AbstractCorpus + + # initialize an untrained model + Model(alphaPrior::Vector{Float64}, + betaPrior::Vector{Float64}, + corpus::AbstractCorpus) = begin + K = length(alphaPrior) + m = new( + alphaPrior, + betaPrior, + corpus) + return m + end + + # initialize a trained model + Model(trainedModel::Model, + corpus::AbstractCorpus) = begin + m = new( + trainedModel.alphaPrior, + trainedModel.betaPrior, + corpus + ) + return m + end +end + +struct State + topics::Array{Float64,2} + topicSums::Vector{Float64} + docSums::Array{Float64,2} + assignments::Array{Array{Int64,1},1} + conditionals::Array{Array{Float64,2},1} # the p parameter for the word assignment (cat/multinom) variable + frozen::Bool + + # randomly initialize the state + State(model::Model, + corpus::AbstractCorpus) = begin + K = length(model.alphaPrior) + s = new( + zeros(Float64, K, length(corpus.lexicon)), # topics + zeros(Float64, K), # topicSums + zeros(Float64, K, length(corpus.docs)), #docSums + fill(Array{Int64,1}(undef,0), length(corpus.docs)), # assignments + fill(Array{Float64,2}(undef,0,K), length(corpus.docs)), # conditionals + false + ) + initializeAssignments(model,s,corpus) + return s + end + + # initialize the state from a trained model + 
State(topics::Array{Float64,2}, + topicSums::Vector{Float64}, + docSums::Array{Float64,2}, + assignments::Array{Array{Int64,1},1}, + conditionals::Array{Array{Float64,2},1}, + frozen::Bool) = begin + s = new( + topics, + topicSums, + docSums, + assignments, + conditionals, + frozen + ) + return s + end +end + +function AllTopics(state::State) + alltopics = [] + for i in 1:length(state.assignments) + append!(alltopics,state.assignments[i]) + end + return convert(Array{Int,1},alltopics) +end + +function initializeAssignments(model::Model,state::State,corpus::AbstractCorpus) + for dd in 1:length(corpus) + @inbounds words = corpus.docs[dd].terms + @inbounds state.assignments[dd] = zeros(length(words)) + @inbounds state.conditionals[dd] = zeros(length(words), length(model.alphaPrior)) + for ww in 1:length(words) + @inbounds word = words[ww] + @inbounds state.conditionals[dd][ww,:] = model.alphaPrior + topic = sampleMultinomial(ww,dd,state) + @inbounds state.assignments[dd][ww] = topic + updateSufficientStatistics(word, topic, dd, + model.corpus.weights[dd][ww], + state) + end + end + return +end + + +function sampleMultinomial(word_ind::Int64, + document::Int64, + state::State) + cond = state.conditionals[document][word_ind,:] + pSum = sum(cond) + r = rand() * pSum + K = length(cond) + for k in 1:K + if r < cond[k] + return k + else + @inbounds r -= cond[k] + end + end + return 0 +end + +function cond_word(word::Int, + word_ind::Int, + document::Int, + model::Model, + state::State) + V = size(state.topics, 2) + for ii in 1:length(model.alphaPrior) + @inbounds state.conditionals[document][word_ind,ii] = + (state.docSums[ii, document] + model.alphaPrior[ii]) * + (state.topics[ii, word] + model.betaPrior[word]) / + (state.topicSums[ii] + V * model.betaPrior[word]) + end + return +end + +function log_beta(x::Vector{Float64}) + # compute natural log of the multivariate beta function + lb = sum(loggamma.(x)) + lb -= loggamma(sum(x)) +end + +function joint_log_p(model::Model, + state::State) + # calculate the full joint log likelihood; this is useful for testing + log_pz = 0 + for k in 1:length(model.alphaPrior) + @inbounds log_pz += (log_beta(state.topics[k,:] .+ model.betaPrior) - + log_beta(model.betaPrior)) + end + for d in 1:length(model.corpus) + @inbounds log_pz += (log_beta(state.docSums[:,d] .+ model.alphaPrior) - + log_beta(model.alphaPrior)) + end + return log_pz +end + +function sampleWord(word::Int, + word_ind::Int, + document::Int, + model::Model, + state::State) + cond_word(word, word_ind, document, model, state) + sampleMultinomial(word_ind, document, state) +end + + +function updateSufficientStatistics(word::Int64, + topic::Int64, + document::Int64, + scale::Float64, + state::State) + fr = Float64(!state.frozen) + @inbounds state.docSums[topic, document] += scale + @inbounds state.topicSums[topic] += scale * fr + @inbounds state.topics[topic, word] += scale * fr + return +end + +@doc raw""" + getTermDist(state::State, model::Model) + +Compute ``\phi_{k,v} = \frac{\Psi_{k,v} + \beta_v}{\left( \sum^V_{v'=1} \Psi_{k,v'} + \beta_{v'}\right)}`` + +Where ``\vec{\phi_k}`` parameterizes the V-dimensional categorical distribution of words under topic ``k``. 
+ +Returns the ``K \times V`` matrix ``\Phi`` of topic-word probabilities; `state` is not mutated. +""" +function getTermDist(state::State, model::Model) + Phi = Array{Float64,2}(undef,length(model.alphaPrior),length(model.betaPrior)) + for topic in 1:length(model.alphaPrior) + Phi[topic,:] = (state.topics[topic,:] .+ model.betaPrior) ./ (state.topicSums[topic] + sum(model.betaPrior)) + end + return Phi +end + +@doc raw""" + getTopicDist(state::State, model::Model) + +Compute ``\theta_{k,m} = \frac{\Omega_{k,m} + \alpha_k}{\left( \sum^K_{k'=1} \Omega_{k',m} + \alpha_{k'}\right)}`` + +Where ``\vec{\theta_m}`` parameterizes the K-dimensional categorical distribution of topics for document ``m``. + +Returns the ``K \times M`` matrix ``\Theta`` of document-topic probabilities; `state` is not mutated. +""" +function getTopicDist(state::State, model::Model) + Theta = Array{Float64,2}(undef,length(model.alphaPrior),length(model.corpus)) + for doc in 1:length(model.corpus) + Theta[:,doc] = (state.docSums[:,doc] .+ model.alphaPrior) ./ (sum(state.docSums[:,doc]) + sum(model.alphaPrior)) + end + return Theta +end + +function sampleDocument(document::Int, + model::Model, + state::State) + words = model.corpus.docs[document].terms + Nw = length(words) + @inbounds weights = model.corpus.weights[document] + K = length(model.alphaPrior) + @inbounds assignments = state.assignments[document] + for ii in 1:Nw + word = words[ii] + oldTopic = assignments[ii] + updateSufficientStatistics(word, oldTopic, document, -weights[ii], state) + newTopic = sampleWord(word, ii, document, model, state) + @inbounds assignments[ii] = newTopic + updateSufficientStatistics(word, newTopic, document, weights[ii], state) + end + return +end + +function sampleCorpus(model::Model, state::State) + for ii in 1:length(model.corpus) + sampleDocument(ii, model, state) + end + return +end + +# The functions below are designed for public consumption +function trainModel(model::Model, + state::State, + numIterations::Int64) + for ii in 1:numIterations + println(string("Iteration ", ii, "...")) + sampleCorpus(model, state) + end + return +end + +function topTopicWords(model::Model, + state::State, + numWords::Int64) + [model.corpus.lexicon[reverse(sortperm(state.topics'[1:end, row]))[1:numWords]] + for row in 1:size(state.topics,1)] +end diff --git a/src/Data.jl b/src/Data.jl new file mode 100644 index 0000000..14c8794 --- /dev/null +++ b/src/Data.jl @@ -0,0 +1,163 @@ +### Document +abstract type AbstractDocument end + +struct LdaDocument <: AbstractDocument + # a fully observed document generated from the LDA model + theta::Array{Float64,1} # the topic probs for the doc + z::Array{Int64,1} # the topic for each word + terms::Array{Int64,1} # the word tokens + + LdaDocument(alpha::Array{Float64,1}, + P::Array{Float64,2}, + N::Int64) = begin # length of the doc + d = new( + Array{Float64,1}(undef,size(P,2)), + Array{Int64,1}(undef,N), + Array{Int64,1}(undef,N) + ) + GenerateDoc(d,alpha,P) + return d + end + + LdaDocument(theta::Array{Float64,1}, + z::Array{Int64,1}, + terms::Array{Int64,1}) = begin + d = new(theta,z,terms) + return d + end +end + +function GenerateDoc(doc::LdaDocument, + alpha::Array{Float64,1}, + Phi::Array{Float64,2}) + dd = Dirichlet(alpha) + doc.theta .= vec(rand(dd,1)) + cat = Categorical(vec(doc.theta)) + doc.z .= rand(cat,length(doc)) + for i in 1:length(doc) + @inbounds dc = Categorical(Phi[:,doc.z[i]]) + @inbounds doc.terms[i] = rand(dc,1)[1] + end + return +end + +struct Document <: AbstractDocument + # this is actual data, where only the terms are observed + terms::Array{Int64,1} # the word tokens + Document(terms::Array{Int64,1}) = new(terms) 
+end + +function length(doc::AbstractDocument) + return size(doc.terms,1) +end + +### Corpus +abstract type AbstractCorpus end + +struct LdaCorpus <: AbstractCorpus + # a fully observed corpus generated from the LDA model + docs::Array{LdaDocument,1} + alpha::Array{Float64,1} + beta::Array{Float64,1} + Phi::Array{Float64,2} + weights::Array{Array{Float64,1}} # only unweighted terms supported + lexicon::Array{String,1} + + LdaCorpus(k::Int64, + lexLength::Int64, + corpLambda::Int64, + corpLength::Int64, + scaleK::Float64, + scaleL::Float64) = begin + w = Array{Array{Float64,1},1}(undef,corpLength) + lex = string.([1:1:lexLength;]) # there is no real lexicon for synthetic data, so use the term indices as strings + a = fill(scaleK,k) # concentration parameter for the Dirichlet topic prior + b = fill(scaleL,lexLength) # concentration parameter for the Dirichlet token prior + dl = Poisson(corpLambda) + docLengths = rand(dl,corpLength) # the lengths of the docs in the corpus + db = Dirichlet(b) + P = rand(db,k) # the topic-word distributions: one lexLength vector drawn from the Dirichlet token prior for each of the k topics + d = Array{LdaDocument,1}(undef,corpLength) + for i in 1:corpLength + w[i] = ones(docLengths[i]) + @inbounds d[i] = LdaDocument(a,P,docLengths[i]) + end + return new(d, a, b, P, w, lex) + end + + LdaCorpus(docs::Array{LdaDocument,1}, # the documents + alpha::Array{Float64,1}, + beta::Array{Float64,1}, + Phi::Array{Float64,2}, + weights::Array{Array{Float64,1},1}, + lexicon::Array{String,1}) = begin + c = new(docs,alpha,beta,Phi,weights,lexicon) + return c + end +end + +function CorpusTopics(corpus::LdaCorpus) + cat(dims=2,map(i->vec(i.theta), corpus.docs)...) # get a 2d array of (document wise) mixed membership for the corpus +end + +function AllTopics(corpus::LdaCorpus) + alltopics = [] + for i in 1:length(corpus) + append!(alltopics,corpus.docs[i].z) + end + return convert(Array{Int,1},alltopics) +end + +struct Corpus <: AbstractCorpus + docs::Array{Document,1} + weights::Array{Array{Float64,1},1} + lexicon::Array{String,1} + + Corpus(docs::Array{Document,1}, + weights::Array{Array{Float64,1},1}, + lexicon::Array{String,1}) = begin + return new( + docs, + weights, + lexicon + ) + end + + Corpus(docs::Array{Document,1}, + lexicon::Array{String,1}) = begin + return new( + docs, + map(x -> ones(Float64,length(x)), docs), # no weights + lexicon + ) + end +end + +function length(corpus::AbstractCorpus) + return length(corpus.docs) +end + +# Expand a term:count pair into a count-length sequence [term, term, ....] 
+function termToWordSequence(term::AbstractString) + parts = split(term, ":") + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) +end + +function readDocs(stream) + corpus = readlines(stream) + docs = Array{Document,1}(undef,length(corpus)) + for i in 1:length(corpus) + @inbounds terms = split(corpus[i], " ")[2:end] + @inbounds docs[i] = Document(termToWordSequence(terms[1])) + for ii in 2:length(terms) + @inbounds append!(docs[i].terms, termToWordSequence(terms[ii])) + end + end + return docs +end + +function readLexicon(stream) + lines = readlines(stream) + chomped = map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},chomped) # convert from substrings +end diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 2990d26..3c6c9e9 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -1,210 +1,32 @@ module TopicModels +#Imports import Base.length -RaggedMatrix{T} = Array{Array{T,1},1} - -struct Corpus - documents::RaggedMatrix{Int64} - weights::RaggedMatrix{Float64} - - Corpus(documents::RaggedMatrix{Int64}, - weights::RaggedMatrix{Float64}) = begin - return new( - documents, - weights - ) - end - - Corpus(documents::RaggedMatrix{Int64}) = begin - weights = map(documents) do doc - ones(Float64, length(doc)) - end - return new( - documents, - weights - ) - end -end - -struct Model - alphaPrior::Vector{Float64} - betaPrior::Float64 - topics::Array{Float64,2} - topicSums::Vector{Float64} - documentSums::Array{Float64,2} - assignments::RaggedMatrix{Int64} - frozen::Bool - corpus::Corpus - - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, - corpus::Corpus) = begin - K = length(alphaPrior) - m = new( - alphaPrior, - betaPrior, - zeros(Float64, K, V), # topics - zeros(Float64, K), # topicSums - zeros(Float64, K, length(corpus.documents)), #documentSums - Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments - false, - corpus - ) - initializeAssignments(m) - return m - end - - Model(trainedModel::Model, corpus::Corpus) = begin - m = new( - trainedModel.alphaPrior, - trainedModel.betaPrior, - trainedModel.topics, - trainedModel.topicSums, - trainedModel.documentSums, - fill(Array(Int64, 0), length(corpus.documents)), - true, - corpus - ) - initializeAssignments(m) - return m - end -end - -function length(corpus::Corpus) - return length(corpus.documents) -end - -function initializeAssignments(model::Model) - for dd in 1:length(model.corpus) - @inbounds words = model.corpus.documents[dd] - @inbounds model.assignments[dd] = fill(0, length(words)) - for ww in 1:length(words) - @inbounds word = words[ww] - topic = sampleMultinomial(model.alphaPrior) - @inbounds model.assignments[dd][ww] = topic - updateSufficientStatistics( - word, topic, dd, model.corpus.weights[dd][ww], model) - end - end - return -end - -function sampleMultinomial(p::Array{Float64,1}) - pSum = sum(p) - r = rand() * pSum - K = length(p) - for k in 1:K - if r < p[k] - return k - else - r -= p[k] - end - end - return 0 -end - -function wordDistribution(word::Int, - document::Int, - model::Model, - out::Vector{Float64}) - V = size(model.topics, 2) - for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / - (model.topicSums[ii] + V * model.betaPrior) - @inbounds out[ii] = u - end - return -end - -function sampleWord(word::Int, - document::Int, - model::Model, - p::Vector{Float64}) - wordDistribution(word, document, model, p) - sampleMultinomial(p) -end - - -function 
updateSufficientStatistics(word::Int64, - topic::Int64, - document::Int64, - scale::Float64, - model::Model) - fr = Float64(!model.frozen) - @inbounds model.documentSums[topic, document] += scale - @inbounds model.topicSums[topic] += scale * fr - @inbounds model.topics[topic, word] += scale * fr - return -end - -function sampleDocument(document::Int, - model::Model) - @inbounds words = model.corpus.documents[document] - Nw = length(words) - @inbounds weights = model.corpus.weights[document] - K = length(model.alphaPrior) - p = Array{Float64,1}(undef,K) - @inbounds assignments = model.assignments[document] - for ii in 1:Nw - @inbounds word = words[ii] - @inbounds oldTopic = assignments[ii] - updateSufficientStatistics(word, oldTopic, document, -weights[ii], model) - newTopic = sampleWord(word, document, model, p) - @inbounds assignments[ii] = newTopic - updateSufficientStatistics(word, newTopic, document, weights[ii], model) - end - return -end - -function sampleCorpus(model::Model) - for ii in 1:length(model.corpus) - sampleDocument(ii, model) - end - return -end - -# Note, files are zero indexed, but we are 1-indexed. -function termToWordSequence(term::AbstractString) - parts = split(term, ":") - fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end - -# The functions below are designed for public consumption -function trainModel(model::Model, - numIterations::Int64) - for ii in 1:numIterations - println(string("Iteration ", ii, "...")) - sampleCorpus(model) - end - return -end - -function topTopicWords(model::Model, - lexicon::Array{String,1}, - numWords::Int64) - [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] - for row in 1:size(model.topics,1)] -end - -function readDocuments(stream) - lines = readlines(stream) - convert(RaggedMatrix{Int64}, - [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) 
- for line in lines]) -end - -function readLexicon(stream) - lines = readlines(stream) - convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) -end +using Random, Distributions, Plots, UMAP +using SpecialFunctions: loggamma +using Clustering: randindex +#Exports export Corpus, + LdaCorpus, Model, - readDocuments, + State, + readDocs, readLexicon, + termToWordSequence, topTopicWords, - trainModel -end + trainModel, + CorpusTopics, + CorpusARI, + DocsARI + +#Data that we make or find in real life: +include("Data.jl") + +#Bayesian learning and inference: +include("Computation.jl") + +#Stuff like perplexity and ARI: +include("Validation.jl") +end #module \ No newline at end of file diff --git a/src/Validation.jl b/src/Validation.jl new file mode 100644 index 0000000..57710e5 --- /dev/null +++ b/src/Validation.jl @@ -0,0 +1,15 @@ +function CorpusARI(state::State,model::Model,corpus::LdaCorpus) + #for synthetic data, turn our mixed membership document vectors into max likelihood assignments + # and check ARI between the ground truth and the state + + learned_max_clust = map(i->i[1], findmax(getTopicDist(state,model),dims=1)[2]) + true_max_clust = map(i->i[1], findmax(CorpusTopics(corpus),dims=1)[2]) + randindex(learned_max_clust,true_max_clust) +end + +function DocsARI(state::State,corpus::LdaCorpus) + #for synthetic data, find the topic ARI across [all terms in] all documents in the corpus + learned_clust = AllTopics(state) + true_clust = AllTopics(corpus) + randindex(learned_clust,true_clust) +end diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl new file mode 100644 index 0000000..fda7937 --- /dev/null +++ b/test/Gibbs_unit_tests.jl @@ -0,0 +1,54 @@ +using Test, TopicModels, Random +using TopicModels: updateSufficientStatistics, joint_log_p #non-exported fns we need + + +# use the equality of likelihood ratio to test that the conditional distribution is consistent with the joint distribution +@testset "LDA docs" begin + # generate some data from LDA where the doclength is Poisson + k = 7 + lexLength = 10 + corpLambda = 10 # poisson parameter for random doc length + corpLength = 10 + scaleK = 0.1 + scaleL = 0.1 + Random.seed!(123) + + corpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL) + + model = Model(corpus.alpha, corpus.beta, corpus) + state = State(model, corpus) + trainModel(model, state, 10) # update all the state variables + + # pick a random doc/word to iterate the sampler + doc_ind = rand(1:corpLength) + word_ind = rand(1:length(corpus.docs[doc_ind])) + word = corpus.docs[doc_ind].terms[word_ind] + + conditional = state.conditionals[doc_ind][word_ind,:] + oldTopic = copy(state.assignments[doc_ind][word_ind]) # the original word token + + newTopic = rand(collect(1:k)[1:end .!= oldTopic],1) # a different word token + newTopic = Int64(newTopic[1]) + + #get the original state probs + joint_Lw = copy(joint_log_p(model,state)) # log prob of the full joint under original topic for + cond_Lw = log(state.conditionals[doc_ind][word_ind,oldTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=k|...) + cond_Lw_new = log(state.conditionals[doc_ind][word_ind,newTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=k|...) 
+ + updateSufficientStatistics(word, oldTopic, doc_ind, -model.corpus.weights[doc_ind][word_ind], state) #remove counts for the old topic + updateSufficientStatistics(word, newTopic, doc_ind, model.corpus.weights[doc_ind][word_ind], state) #update stats for new topic + joint_Lw_new = copy(joint_log_p(model,state)) # log prob of the full joint under the new topic assignment + + print("joint_Lw: ", joint_Lw, "\n") + print("cond_Lw: ", cond_Lw, "\n") + + print("joint_Lw_new: ", joint_Lw_new, "\n") + print("cond_Lw_new: ", cond_Lw_new, "\n") + + print("joint_LR: ", joint_Lw_new-joint_Lw, "\n") + print("cond_LR: ", cond_Lw_new-cond_Lw, "\n") + print("old Topic: ", oldTopic, "\n") + print("new Topic: ", newTopic, "\n") + + @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
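Note: this diff adds test/Gibbs_unit_tests.jl but does not touch a test/runtests.jl, which is what `Pkg.test()` actually runs. If the repository does not already have one, a minimal runner could look like the sketch below (an assumption about the standard Pkg test layout, not part of this change):

# test/runtests.jl -- hypothetical runner, assuming the standard Pkg test layout
using Test

@testset "TopicModels.jl" begin
    # likelihood-ratio consistency check for the collapsed Gibbs sampler
    include("Gibbs_unit_tests.jl")
end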