From 295e3e39dffe80b86f863b778cdd8c267c676a2b Mon Sep 17 00:00:00 2001 From: Fineday Date: Thu, 14 Apr 2016 19:31:17 +0800 Subject: [PATCH 1/5] fix deprecated warning for julia 0.4 --- README.md | 10 ++++++++++ src/TopicModels.jl | 18 ++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a8c984a..2dd09d3 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ document. The space whence the words are drawn is termed the lexicon. Formally, the model is defined as +``` For each topic k, phi_k ~ Dirichlet(beta) For each document d, @@ -21,6 +22,7 @@ Formally, the model is defined as For each word w, z ~ Multinomial(theta) w ~ Multinomial(phi_z) +``` alpha and beta are hyperparameters of the model. The number of topics, K, is a fixed parameter of the model, and w is observed. This package fits @@ -31,8 +33,10 @@ the topics using collapsed Gibbs sampling (Griffiths and Steyvers, 2004). We describe the functions of the package using an example. First we load corpora from data files as follows: +``` testDocuments = readDocuments(open("cora.documents")) testLexicon = readLexicon(open("cora.lexicon")) +``` These read files in LDA-C format. The lexicon file is assumed to have one word per line. The document file consists of one document per line. Each @@ -45,7 +49,9 @@ the number of tuples for that document. With the documents loaded, we instantiate a model that we want to train: +``` model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments) +``` This is a model with 10 topics. alpha is set to a uniform Dirichlet prior with 0.1 weight on each topic (the dimension of this variable is used @@ -54,7 +60,9 @@ the prior weight on phi (i.e. beta) should be set to 0.01. The third parameter is the lexicon size; here we just use the lexicon we have just read. The fourth parameter is the collection of documents. +``` trainModel(testDocuments, model, 30) +``` With the model defined, we can train the model on a corpus of documents. The trainModel command takes the corpus as the first argument, the model @@ -64,7 +72,9 @@ will be mutated in place. Finally we can examine the output of the trained model using topTopicWords. +``` topWords = topTopicWords(model, testLexicon, 10) +``` This function retrieves the top words associated with each topic; this serves as a useful summary of the model. The first parameter is the model, diff --git a/src/TopicModels.jl b/src/TopicModels.jl index d1bd4e2..38696f5 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -133,7 +133,7 @@ function updateSufficientStatistics(word::Int64, document::Int64, scale::Float64, model::Model) - fr = float64(!model.frozen) + fr = Float64(!model.frozen) @inbounds model.documentSums[topic, document] += scale @inbounds model.topicSums[topic] += scale * fr @inbounds model.topics[topic, word] += scale * fr @@ -167,9 +167,9 @@ function sampleCorpus(model::Model) end # Note, files are zero indexed, but we are 1-indexed. 
-function termToWordSequence(term::String) +function termToWordSequence(term::AbstractString) parts = split(term, ":") - fill(int64(parts[1]) + 1, int64(parts[2])) + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) end # The functions below are designed for public consumption @@ -190,16 +190,15 @@ function topTopicWords(model::Model, end function readDocuments(stream) - lines = readlines(stream) - convert( - RaggedMatrix{Int64}, - [apply(vcat, [termToWordSequence(term) for term in split(line, " ")[2:end]]) - for line in lines]) + lines = readlines(stream) + convert(RaggedMatrix{Int64}, + [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) + for line in lines]) end function readLexicon(stream) lines = readlines(stream) - map(chomp, convert(Array{String,1}, lines)) + map(chomp, convert(Array{AbstractString,1}, lines)) end export Corpus, @@ -208,5 +207,4 @@ export Corpus, readLexicon, topTopicWords, trainModel - end From feec08d28d60013d62069eb6122ca1d525cb852a Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Sat, 29 Feb 2020 21:37:45 -0800 Subject: [PATCH 2/5] v1.3 compat v1.3 compat fixed lexicon --- .DS_Store | Bin 0 -> 6148 bytes Manifest.toml | 188 +++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 18 +++++ REQUIRE | 0 src/TopicModels.jl | 34 ++++---- 5 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 .DS_Store create mode 100644 Manifest.toml create mode 100644 Project.toml delete mode 100644 REQUIRE diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8b6cf85eb3dc8b3f099f156b7f1f2012e201d7ab GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8ZYyFAq8@Yc)n$)1NlDE(|@)3L; zXLdJYvEWU_&cNLUhY(d!?ONMe)~MS~ z)0x%ntknX&xqmR5RmIMpb9mCbdw9&AX3sC0T@K_c*)&+dD=4P8dv6krWONH7MRXBG zNDL4I!~ij{WDJ;tK&&rW-Bd0yKn(oM0PYVqG(^{6u2F3r(BbtN{S`zM(D5ySXd845 z<{H5R!gVU3PUYr_!F4)}+a}I6m}}JOjH{Jl9Pi2-6@o`JG9 zEj<5^;g?zX$e&Lkix?mV{uu+j((yWND9W6z-^#`+sePmZT dwckb^;#`BdMw|u5RXQMD1Qa3E5d*)#zz0PoPhS84 literal 0 HcmV?d00001 diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..c704ecc --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,188 @@ +# This file is machine-generated - editing it directly is not advised + +[[Arpack]] +deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] +git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" +uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +version = "0.4.0" + +[[Arpack_jll]] +deps = ["Libdl", "OpenBLAS_jll", "Pkg"] +git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" +uuid = "68821587-b530-5797-8361-c406ea357684" +version = "3.5.0+2" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.8" + +[[DataAPI]] +git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.1.0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] 
+git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.22.4" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.8.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.3" + +[[OpenBLAS_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.7+5" + +[[OpenSpecFun_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+1" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.1.0" + +[[PDMats]] +deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] +git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.9.11" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.3.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Rmath]] +deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] +git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.6.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.10.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +uuid = 
"2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.1" + +[[StatsFuns]] +deps = ["Rmath", "SpecialFunctions"] +git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.4" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..65ba1d2 --- /dev/null +++ b/Project.toml @@ -0,0 +1,18 @@ +name = "TopicModels" +uuid = "e9825ca3-3499-4c9b-97dc-a93734876e50" +authors = ["Jonathan Chang "] +version = "0.1.0" + +[deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" + +[compat] +julia = "1.3" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/REQUIRE b/REQUIRE deleted file mode 100644 index e69de29..0000000 diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 38696f5..2990d26 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -2,9 +2,9 @@ module TopicModels import Base.length -typealias RaggedMatrix{T} Array{Array{T,1},1} +RaggedMatrix{T} = Array{Array{T,1},1} -type Corpus +struct Corpus documents::RaggedMatrix{Int64} weights::RaggedMatrix{Float64} @@ -15,7 +15,7 @@ type Corpus weights ) end - + Corpus(documents::RaggedMatrix{Int64}) = begin weights = map(documents) do doc ones(Float64, length(doc)) @@ -27,7 +27,7 @@ type Corpus end end -type Model +struct Model alphaPrior::Vector{Float64} betaPrior::Float64 topics::Array{Float64,2} @@ -37,9 +37,9 @@ type Model frozen::Bool corpus::Corpus - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, + Model(alphaPrior::Vector{Float64}, + betaPrior::Float64, + V::Int64, corpus::Corpus) = begin K = length(alphaPrior) m = new( @@ -48,7 +48,7 @@ type Model zeros(Float64, K, V), # topics zeros(Float64, K), # topicSums zeros(Float64, K, length(corpus.documents)), #documentSums - fill(Array(Int64, 0), length(corpus.documents)), # assignments + Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments false, corpus ) @@ -111,8 +111,8 @@ function wordDistribution(word::Int, out::Vector{Float64}) V = size(model.topics, 2) for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / + u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * + (model.topics[ii, word] + model.betaPrior) / (model.topicSums[ii] + V * model.betaPrior) @inbounds out[ii] = u end @@ -128,10 +128,10 @@ function sampleWord(word::Int, end -function updateSufficientStatistics(word::Int64, +function updateSufficientStatistics(word::Int64, topic::Int64, document::Int64, - scale::Float64, + scale::Float64, model::Model) fr = Float64(!model.frozen) @inbounds model.documentSums[topic, document] += scale @@ -146,7 +146,7 @@ function sampleDocument(document::Int, Nw = length(words) @inbounds weights = model.corpus.weights[document] K = length(model.alphaPrior) - p = Array(Float64, K) + p = Array{Float64,1}(undef,K) @inbounds assignments = model.assignments[document] for ii in 1:Nw @inbounds word = words[ii] @@ -170,10 +170,10 
@@ end function termToWordSequence(term::AbstractString) parts = split(term, ":") fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end +end # The functions below are designed for public consumption -function trainModel(model::Model, +function trainModel(model::Model, numIterations::Int64) for ii in 1:numIterations println(string("Iteration ", ii, "...")) @@ -183,7 +183,7 @@ function trainModel(model::Model, end function topTopicWords(model::Model, - lexicon::Array{ASCIIString,1}, + lexicon::Array{String,1}, numWords::Int64) [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] for row in 1:size(model.topics,1)] @@ -198,7 +198,7 @@ end function readLexicon(stream) lines = readlines(stream) - map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) end export Corpus, From 68884a6d8e9f97600dc59f2d20366d5c24715099 Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Sun, 1 Mar 2020 18:33:10 -0800 Subject: [PATCH 3/5] Refactoring 1) Type hierarchy for data: rooted at abstract corpus and document, which support subtypes representing fully-synthetic and real world data 2) Type hierarchy for MCMC: break struct model into "model" and "state" reflecting the scope (document locality) of latent variables vs model parameters and hyperpriors. This will facilitate clear cut testing in next PR based on Grosse and Duvenaud https://arxiv.org/abs/1412.5218 --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 1 + Manifest.toml | 367 +++++++++++++++++++++++++++++++++++++++++++-- Project.toml | 3 + examples/LDA.jl | 41 ++++- src/Computation.jl | 251 +++++++++++++++++++++++++++++++ src/Data.jl | 163 ++++++++++++++++++++ src/TopicModels.jl | 226 +++------------------------- src/Validation.jl | 15 ++ 9 files changed, 844 insertions(+), 223 deletions(-) delete mode 100644 .DS_Store create mode 100644 .gitignore create mode 100644 src/Computation.jl create mode 100644 src/Data.jl create mode 100644 src/Validation.jl diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 8b6cf85eb3dc8b3f099f156b7f1f2012e201d7ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z-O8ZYyFAq8@Yc)n$)1NlDE(|@)3L; zXLdJYvEWU_&cNLUhY(d!?ONMe)~MS~ z)0x%ntknX&xqmR5RmIMpb9mCbdw9&AX3sC0T@K_c*)&+dD=4P8dv6krWONH7MRXBG zNDL4I!~ij{WDJ;tK&&rW-Bd0yKn(oM0PYVqG(^{6u2F3r(BbtN{S`zM(D5ySXd845 z<{H5R!gVU3PUYr_!F4)}+a}I6m}}JOjH{Jl9Pi2-6@o`JG9 zEj<5^;g?zX$e&Lkix?mV{uu+j((yWND9W6z-^#`+sePmZT dwckb^;#`BdMw|u5RXQMD1Qa3E5d*)#zz0PoPhS84 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Manifest.toml b/Manifest.toml index c704ecc..469e496 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,5 +1,11 @@ # This file is machine-generated - editing it directly is not advised +[[ArnoldiMethod]] +deps = ["DelimitedFiles", "LinearAlgebra", "Random", "SparseArrays", "StaticArrays", "Test"] +git-tree-sha1 = "2b6845cea546604fb4dca4e31414a6a59d39ddcd" +uuid = "ec485272-7323-5ecc-a04f-4719b315124d" +version = "0.0.4" + [[Arpack]] deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" @@ -12,14 +18,62 @@ git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" uuid = "68821587-b530-5797-8361-c406ea357684" version = "3.5.0+2" +[[ArrayInterface]] +deps = ["LinearAlgebra", "Requires", "SparseArrays"] +git-tree-sha1 = "81e5dd1f5374aba2badfe967fc6a132c02ab471a" +uuid = 
"4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "2.5.0" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" +[[Bzip2_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "92463331a641b19fc3baa427e0b76cdbd54dc05d" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.6+1" + +[[Clustering]] +deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "SparseArrays", "Statistics", "StatsBase"] +git-tree-sha1 = "225b796b1aa8b2e5c9c90bfb1f6779772d08bc00" +uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +version = "0.13.4" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.9.1" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.11.2" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.2.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.2.0+1" + +[[Contour]] +deps = ["StaticArrays"] +git-tree-sha1 = "6d56f927b33d3820561b8f89d7de311718683846" +uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" +version = "0.5.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -36,6 +90,28 @@ version = "0.17.10" deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["StaticArrays"] +git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "1.0.2" + +[[DiffRules]] +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "1.0.1" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics"] +git-tree-sha1 = "23717536c81b63e250f682b0e0933769eecd1411" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.8.2" + [[Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -46,22 +122,105 @@ git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" version = "0.22.4" +[[FFMPEG]] +deps = ["FFMPEG_jll"] +git-tree-sha1 = "c82bef6fc01e30d500f588cd01d29bdd44f1924e" +uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" +version = "0.3.0" + +[[FFMPEG_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "LAME_jll", "LibVPX_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "Pkg", "Zlib_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] +git-tree-sha1 = 
"814bf7865005bee373521cb49cad46182bec53b4" +uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" +version = "4.1.0+2" + [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" +[[FiniteDiff]] +deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] +git-tree-sha1 = "aa78e468afa6a0fde472c3dba0782d1ab60b203d" +uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" +version = "2.2.1" + +[[FixedPointNumbers]] +git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.7.1" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.9" + +[[FreeType2_jll]] +deps = ["Bzip2_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "8e290780d75bc0f676548c3bb84c153f83d14bdc" +uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" +version = "2.10.1+1" + +[[FriBidi_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e479537bf8a8f060c546553c14fd0633978dda7e" +uuid = "559328eb-81f9-559d-9380-de523a88c83c" +version = "1.0.5+2" + +[[GR]] +deps = ["Base64", "DelimitedFiles", "LinearAlgebra", "Printf", "Random", "Serialization", "Sockets", "Test", "UUIDs"] +git-tree-sha1 = "41dd1395d4dc559f1c2cb558cba784ef37b561fe" +uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" +version = "0.47.0" + +[[GeometryTypes]] +deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "StaticArrays"] +git-tree-sha1 = "9d7520999ca80a51f1bf41be2268a9ac0e4f0619" +uuid = "4d00f742-c7ba-57c2-abde-4428a4b178cb" +version = "0.8.1" + +[[Inflate]] +deps = ["Pkg", "Printf", "Random", "Test"] +git-tree-sha1 = "b7ec91c153cf8bff9aff58b39497925d133ef7fd" +uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +version = "0.1.1" + [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LAME_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "a46bff84977753fdba8db3c50db1435bb1eb4288" +uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" +version = "3.100.0+0" + [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[LibVPX_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e3549ca9bf35feb9d9d954f4c6a9032e92f46e7c" +uuid = "dd192d2f-8180-539f-9fb4-cc70b1dcf69a" +version = "1.8.1+1" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +[[LightGraphs]] +deps = ["ArnoldiMethod", "DataStructures", "Distributed", "Inflate", "LinearAlgebra", "Random", "SharedArrays", "SimpleTraits", "SparseArrays", "Statistics"] +git-tree-sha1 = "f40c4dbcd957cc3afc8cca0ff26c9f8304def00d" +uuid = "093fc24a-ae57-5d10-9952-331d41423f4d" +version = "1.3.1" + [[LinearAlgebra]] deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -69,27 +228,94 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[LsqFit]] +deps = ["Distributions", "LinearAlgebra", "NLSolversBase", "OptimBase", "Random", "StatsBase", "Test"] +git-tree-sha1 = "186c2afbdb3cd52191078cfc6176f7084ed9dfb7" +uuid = "2fda8390-95c7-5789-9bda-21331edee243" +version = "0.8.1" + +[[MacroTools]] +deps = ["DataStructures", "Markdown", "Random"] +git-tree-sha1 = 
"07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.4" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[Measures]] +git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f" +uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" +version = "0.3.1" + [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" version = "0.4.3" +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NLSolversBase]] +deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] +git-tree-sha1 = "7c4e66c47848562003250f28b579c584e55becc0" +uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" +version = "7.6.1" + +[[NaNMath]] +git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.3" + +[[NearestNeighborDescent]] +deps = ["DataStructures", "Distances", "LightGraphs", "Random", "Reexport", "SparseArrays"] +git-tree-sha1 = "77ac6b8529e22ee61d2322db7739579d18cd6d19" +uuid = "dd2c4c9e-a32f-5b2f-b342-08c2f244fce8" +version = "0.3.0" + +[[NearestNeighbors]] +deps = ["Distances", "StaticArrays"] +git-tree-sha1 = "8bc6180f328f3c0ea2663935db880d34c57d6eae" +uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +version = "0.4.4" + +[[Ogg_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "f4d4d03c562f40652c0baabd3e4cb2e756e157b7" +uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" +version = "1.3.3+0" + [[OpenBLAS_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +git-tree-sha1 = "adc45e596df7007d48bf6829efb1dc64fdec3ddc" uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" +version = "0.3.7+6" -[[OpenSpecFun_jll]] +[[OpenSSL_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +git-tree-sha1 = "33661eb9d5484220b4367d067f499b30bafc9c12" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "1.1.1+1" + +[[OpenSpecFun_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+2" + +[[OptimBase]] +deps = ["Compat", "NLSolversBase", "Printf", "Reexport", "Test"] +git-tree-sha1 = "92667ab46a66ad502ec3044f65c41ea68b2e0e9c" +uuid = "87e2bd06-a317-5318-96d9-3ecbac512eee" +version = "2.0.0" + +[[Opus_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "c9931bf2fcdb57b48c227395c61ea82603212f7d" +uuid = "91d4177d-7536-5919-b921-800302f37372" +version = "1.3.1+0" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -103,10 +329,34 @@ git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" version = "0.9.11" +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.12" + [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +[[PlotThemes]] +deps = ["PlotUtils", "Requires", "Statistics"] +git-tree-sha1 = "df772cc7c78862da96af1ee85cd0111c6640e44e" +uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" +version = "1.0.1" + +[[PlotUtils]] +deps = ["Colors", "Dates", "Printf", "Random", "Reexport"] +git-tree-sha1 = "a146cb72ec962aec81d478de49d1011db06dd754" +uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" +version = "0.6.3" + 
+[[Plots]] +deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryTypes", "JSON", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "Reexport", "Requires", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"] +git-tree-sha1 = "51d4d9154e71253abd2a7df2ee0e3d6b8d14f8b1" +uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +version = "0.29.5" + [[Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -125,11 +375,34 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[RecipesBase]] +git-tree-sha1 = "b4ed4a7f988ea2340017916f7c9e5d7560b52cae" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "0.8.0" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.0.1" + [[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "86c5647b565873641538d8f812c04e4c9dbeb370" uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" +version = "0.6.1" + +[[Rmath_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "1660f8fefbf5ab9c67560513131d4e933012fc4b" +uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.2.2+0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -137,6 +410,22 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Showoff]] +deps = ["Dates"] +git-tree-sha1 = "e032c9df551fb23c9f98ae1064de074111b7bc39" +uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" +version = "0.3.1" + +[[SimpleTraits]] +deps = ["InteractiveUtils", "MacroTools"] +git-tree-sha1 = "2bdf3b6300a9d66fe29ee8bb51ba100c4df9ecbc" +uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" +version = "0.9.1" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -156,15 +445,21 @@ git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "0.10.0" +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.1" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" +version = "0.32.2" [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] @@ -180,9 +475,51 @@ uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[UMAP]] +deps = ["Arpack", "Distances", "LinearAlgebra", "LsqFit", "NearestNeighborDescent", "Random", "SparseArrays"] +git-tree-sha1 = "cfb648199a80ecb46eac88135714e38c23ff860d" +uuid = 
"c4f8c510-2410-5be4-91d7-4fbaeb39457e" +version = "0.1.5" + [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+8" + +[[libass_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "3fd3ea3525f2e3d337c54a52b2ca78a5a272bbf5" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+0" + +[[libfdk_aac_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "0e4ace600c20714a8dd67700c4502714d8473e8e" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+1" + +[[libvorbis_jll]] +deps = ["Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "71e54fb89ac3e0344c7185d1876fd96b0f246952" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+2" + +[[x264_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "23664c0757c3740050ca0e22944c786c165ca25a" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2019.5.25+1" + +[[x265_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9345e417084421a8e91373d6196bc58e660eed2a" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+0" diff --git a/Project.toml b/Project.toml index 65ba1d2..a3f7a69 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,12 @@ authors = ["Jonathan Chang "] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] julia = "1.3" diff --git a/examples/LDA.jl b/examples/LDA.jl index 16f7183..9360e0f 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,14 +1,41 @@ -using TopicModels +using TopicModels, Plots, UMAP -exdir = Pkg.dir("TopicModels", "examples") +exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") -testDocuments = readDocuments(open(joinpath(exdir, "cora.documents"))) +testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) testLexicon = readLexicon(open(joinpath(exdir, "cora.lexicon"))) -corpus = Corpus(testDocuments) +corpus = Corpus(testDocuments,testLexicon) +model = Model(fill(0.1, 10), fill(0.01,length(testLexicon)), corpus) +state = State(model,corpus) -model = Model(fill(0.1, 10), 0.01, length(testLexicon), corpus) +#@time Juno.@run trainModel(model, state, 30) +@time trainModel(model, state, 30) +topWords = topTopicWords(model, state, 10) -@time trainModel(model, 30) +embedding = umap(state.topics, 2, n_neighbors=10) +maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2])) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max on Learned", marker=(2, 2, :auto, stroke(0))) -topWords = topTopicWords(model, testLexicon, 21) + +k = 10 +lexLength = 1000 +corpLambda = 1000 # poisson parameter for random doc length +corpLength = 100 +scaleK = 0.01 +scaleL = 0.01 +testCorpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL) + +testModel = Model(testCorpus.alpha, testCorpus.beta, testCorpus) +testState = State(testModel, testCorpus) +@time trainModel(testModel, testState, 100) + +# compute validation metrics on a single fit +CorpusARI(testState,testModel,testCorpus) +DocsARI(testState,testCorpus) + +# visualize the fit +@time embedding = 
umap(testState.topics, 2;n_neighbors=10)
+
+maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
diff --git a/src/Computation.jl b/src/Computation.jl
new file mode 100644
index 0000000..13c8266
--- /dev/null
+++ b/src/Computation.jl
@@ -0,0 +1,251 @@
+struct Model
+  alphaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on document topics
+  betaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on words
+  corpus::AbstractCorpus
+
+  # initialize an untrained model
+  Model(alphaPrior::Vector{Float64},
+        betaPrior::Vector{Float64},
+        corpus::AbstractCorpus) = begin
+    K = length(alphaPrior)
+    m = new(
+      alphaPrior,
+      betaPrior,
+      corpus)
+    return m
+  end
+
+  # initialize a trained model
+  Model(trainedModel::Model,
+        corpus::AbstractCorpus) = begin
+    m = new(
+      trainedModel.alphaPrior,
+      trainedModel.betaPrior,
+      corpus
+    )
+    return m
+  end
+end
+
+struct State
+  topics::Array{Float64,2}
+  topicSums::Vector{Float64}
+  docSums::Array{Float64,2}
+  assignments::Array{Array{Int64,1},1}
+  conditionals::Array{Array{Float64,2},1} # the parameter vector p of each word's topic-assignment (categorical/multinomial) variable
+  frozen::Bool
+
+  # randomly initialize the state
+  State(model::Model,
+        corpus::AbstractCorpus) = begin
+    K = length(model.alphaPrior)
+    s = new(
+      zeros(Float64, K, length(corpus.lexicon)), # topics
+      zeros(Float64, K), # topicSums
+      zeros(Float64, K, length(corpus.docs)), # docSums
+      fill(Array{Int64,1}(undef,0), length(corpus.docs)), # assignments
+      fill(Array{Float64,2}(undef,0,K), length(corpus.docs)), # conditionals
+      false
+    )
+    initializeAssignments(model,s,corpus)
+    return s
+  end
+
+  # initialize the state from a trained model
+  State(topics::Array{Float64,2},
+        topicSums::Vector{Float64},
+        docSums::Array{Float64,2},
+        assignments::Array{Array{Int64,1},1},
+        conditionals::Array{Array{Float64,2},1},
+        frozen::Bool) = begin
+    s = new(
+      topics,
+      topicSums,
+      docSums,
+      assignments,
+      conditionals,
+      frozen
+    )
+    return s
+  end
+end
+
+function AllTopics(state::State)
+  alltopics = []
+  for i in 1:length(state.assignments)
+    append!(alltopics,state.assignments[i])
+  end
+  return convert(Array{Int,1},alltopics)
+end
+
+function initializeAssignments(model::Model,state::State,corpus::AbstractCorpus)
+  for dd in 1:length(corpus)
+    @inbounds words = corpus.docs[dd].terms
+    @inbounds state.assignments[dd] = zeros(length(words))
+    @inbounds state.conditionals[dd] = zeros(length(words), length(model.alphaPrior))
+    for ww in 1:length(words)
+      @inbounds word = words[ww]
+      @inbounds state.conditionals[dd][ww,:] = model.alphaPrior
+      topic = sampleMultinomial(ww,dd,state)
+      @inbounds state.assignments[dd][ww] = topic
+      updateSufficientStatistics(word, topic, dd,
+                                 model.corpus.weights[dd][ww],
+                                 state)
+    end
+  end
+  return
+end
+
+function sampleMultinomial(word_ind::Int64,
+                           document::Int64,
+                           state::State)
+  cond = state.conditionals[document][word_ind,:]
+  pSum = sum(cond)
+  r = rand() * pSum
+  K = length(cond)
+  for k in 1:K
+    if r < cond[k]
+      return k
+    else
+      @inbounds r -= cond[k]
+    end
+  end
+  return 0
+end
+
+function cond_word(word::Int,
+                   word_ind::Int,
+                   document::Int,
+                   model::Model,
+                   state::State)
+  V = size(state.topics, 2)
+  for ii in 1:length(model.alphaPrior)
+    @inbounds state.conditionals[document][word_ind,ii] =
+      (state.docSums[ii, document] + model.alphaPrior[ii]) *
+      (state.topics[ii, word] + model.betaPrior[word]) /
+      (state.topicSums[ii] + V * model.betaPrior[word])
+  end
+  return
+end
+
+function log_beta(x::Vector{Float64})
+  # compute the natural log of the multivariate beta function
+  lb = sum(loggamma.(x))
+  lb -= loggamma(sum(x))
+  return lb
+end
+
+function joint_log_p(model::Model,
+                     state::State)
+  # calculate the full joint log likelihood; this is useful for testing
+  log_pz = 0
+  for k in 1:length(model.alphaPrior)
+    @inbounds log_pz += (log_beta(state.topics[k,:] .+ model.betaPrior) -
+                         log_beta(model.betaPrior))
+  end
+  for d in 1:length(model.corpus)
+    @inbounds log_pz += (log_beta(state.docSums[:,d] .+ model.alphaPrior) -
+                         log_beta(model.alphaPrior))
+  end
+  return log_pz
+end
+
+function sampleWord(word::Int,
+                    word_ind::Int,
+                    document::Int,
+                    model::Model,
+                    state::State)
+  cond_word(word, word_ind, document, model, state)
+  sampleMultinomial(word_ind, document, state)
+end
+
+function updateSufficientStatistics(word::Int64,
+                                    topic::Int64,
+                                    document::Int64,
+                                    scale::Float64,
+                                    state::State)
+  fr = Float64(!state.frozen) # when the state is frozen, only the document-local counts change
+  @inbounds state.docSums[topic, document] += scale
+  @inbounds state.topicSums[topic] += scale * fr
+  @inbounds state.topics[topic, word] += scale * fr
+  return
+end
+
+@doc raw"""
+    getTermDist(state::State, model::Model)
+
+Compute ``\phi_{k,v} = \frac{\Psi_{k,v} + \beta_v}{\sum^V_{v'=1} \left( \Psi_{k,v'} + \beta_{v'} \right)}``
+
+Where ``\vec{\phi_k}`` parameterizes the V-dimensional categorical distribution over words for topic ``k``.
+
+Returns the ``K \times V`` matrix ``\Phi``.
+"""
+function getTermDist(state::State, model::Model)
+  Phi = Array{Float64,2}(undef,length(model.alphaPrior),length(model.betaPrior))
+  for topic in 1:length(model.alphaPrior)
+    Phi[topic,:] = (state.topics[topic,:] .+ model.betaPrior) ./ (state.topicSums[topic] + sum(model.betaPrior))
+  end
+  return Phi
+end
+
+@doc raw"""
+    getTopicDist(state::State, model::Model)
+
+Compute ``\theta_{k,m} = \frac{\Omega_{k,m} + \alpha_k}{\sum^K_{k'=1} \left( \Omega_{k',m} + \alpha_{k'} \right)}``
+
+Where ``\vec{\theta_m}`` parameterizes the K-dimensional categorical distribution of a document.
+
+Returns the ``K \times M`` matrix ``\Theta``, where ``M`` is the number of documents.
+"""
+function getTopicDist(state::State, model::Model)
+  Theta = Array{Float64,2}(undef,length(model.alphaPrior),length(model.corpus))
+  for doc in 1:length(model.corpus)
+    Theta[:,doc] = (state.docSums[:,doc] .+ model.alphaPrior) ./ (sum(state.docSums[:,doc]) + sum(model.alphaPrior))
+  end
+  return Theta
+end
+
+function sampleDocument(document::Int,
+                        model::Model,
+                        state::State)
+  words = model.corpus.docs[document].terms
+  Nw = length(words)
+  @inbounds weights = model.corpus.weights[document]
+  K = length(model.alphaPrior)
+  @inbounds assignments = state.assignments[document]
+  for ii in 1:Nw
+    word = words[ii]
+    oldTopic = assignments[ii]
+    updateSufficientStatistics(word, oldTopic, document, -weights[ii], state)
+    newTopic = sampleWord(word, ii, document, model, state)
+    @inbounds assignments[ii] = newTopic
+    updateSufficientStatistics(word, newTopic, document, weights[ii], state)
+  end
+  return
+end
+
+function sampleCorpus(model::Model, state::State)
+  for ii in 1:length(model.corpus)
+    sampleDocument(ii, model, state)
+  end
+  return
+end
+
+# The functions below are designed for public consumption
+function trainModel(model::Model,
+                    state::State,
+                    numIterations::Int64)
+  for ii in 1:numIterations
+    println(string("Iteration ", ii, "..."))
+    sampleCorpus(model, state)
+  end
+  return
+end
+
+function topTopicWords(model::Model,
+                       state::State,
+                       numWords::Int64)
+  [model.corpus.lexicon[reverse(sortperm(state.topics'[1:end, row]))[1:numWords]]
+   for row in 1:size(state.topics,1)]
+end
diff --git a/src/Data.jl b/src/Data.jl
new file mode 100644
index 0000000..fb5379a
--- /dev/null
+++ b/src/Data.jl
@@ -0,0 +1,163 @@
+### Document
+abstract type AbstractDocument end
+
+mutable struct LdaDocument <: AbstractDocument
+  # fully observed data from the LDA model
+  theta::Array{Float64,1} # the topic probs for the doc
+  z::Array{Int64,1} # the topic for each word
+  terms::Array{Int64,1} # the word tokens
+
+  LdaDocument(alpha::Array{Float64,1},
+              P::Array{Float64,2},
+              N::Int64) = begin # N is the length of the doc
+    d = new(
+      Array{Float64,1}(undef,size(P,2)),
+      Array{Int64,1}(undef,N),
+      Array{Int64,1}(undef,N)
+    )
+    GenerateDoc(d,alpha,P)
+    return d
+  end
+
+  LdaDocument(theta::Array{Float64,1},
+              z::Array{Int64,1},
+              terms::Array{Int64,1}) = begin
+    d = new(theta,z,terms)
+    return d
+  end
+end
+
+function GenerateDoc(doc::LdaDocument,
+                     alpha::Array{Float64,1},
+                     Phi::Array{Float64,2})
+  dd = Dirichlet(alpha)
+  doc.theta = vec(rand(dd,1))
+  cat = Categorical(vec(doc.theta))
+  doc.z = rand(cat,length(doc))
+  for i in 1:length(doc)
+    @inbounds dc = Categorical(Phi[:,doc.z[i]])
+    @inbounds doc.terms[i] = rand(dc,1)[1]
+  end
+  return
+end
+
+mutable struct Document <: AbstractDocument
+  # actual data, where only the terms are observed
+  terms::Array{Int64,1} # the word tokens
+  Document(terms::Array{Int64,1}) = new(terms)
+end
+
+function length(doc::AbstractDocument)
+  return size(doc.terms,1)
+end
+
+### Corpus
+abstract type AbstractCorpus end
+
+mutable struct LdaCorpus <: AbstractCorpus
+  # fully observed data from the LDA model
+  docs::Array{LdaDocument,1}
+  alpha::Array{Float64,1}
+  beta::Array{Float64,1}
+  Phi::Array{Float64,2}
+  weights::Array{Array{Float64,1}} # only unweighted terms supported
+  lexicon::Array{String,1}
+
+  LdaCorpus(k::Int64,
+            lexLength::Int64,
+            corpLambda::Int64,
+            corpLength::Int64,
+            scaleK::Float64,
+            scaleL::Float64) = begin
+    w = Array{Array{Float64,1},1}(undef,corpLength)
+    lex = string.([1:1:lexLength;]) # there is no real lexicon for synthetic data, so the token strings are just "1".."lexLength"
+    a = fill(scaleK,k) # scale parameter for the Dirichlet topic prior
+    b = fill(scaleL,lexLength) # scale parameter for the Dirichlet token prior
+    dl = Poisson(corpLambda)
+    docLengths = rand(dl,corpLength) # the lengths of the docs in the corpus
+    db = Dirichlet(b)
+    P = rand(db,k) # topic-token distributions drawn from the Dirichlet token prior: one lexLength-vector for each of the k topics
+    d = Array{LdaDocument,1}(undef,corpLength)
+    for i in 1:corpLength
+      w[i] = ones(docLengths[i])
+      @inbounds d[i] = LdaDocument(a,P,docLengths[i])
+    end
+    return new(d, a, b, P, w, lex)
+  end
+
+  LdaCorpus(docs::Array{LdaDocument,1}, # the documents
+            alpha::Array{Float64,1},
+            beta::Array{Float64,1},
+            Phi::Array{Float64,2},
+            weights::Array{Array{Float64,1},1},
+            lexicon::Array{String,1}) = begin
+    c = new(docs,alpha,beta,Phi,weights,lexicon)
+    return c
+  end
+end
+
+function CorpusTopics(corpus::LdaCorpus)
+  cat(dims=2,map(i->vec(i.theta), corpus.docs)...) # get a 2d array of (document-wise) mixed membership for the corpus
+end
+
+function AllTopics(corpus::LdaCorpus)
+  alltopics = []
+  for i in 1:length(corpus)
+    append!(alltopics,corpus.docs[i].z)
+  end
+  return convert(Array{Int,1},alltopics)
+end
+
+struct Corpus <: AbstractCorpus
+  docs::Array{Document,1}
+  weights::Array{Array{Float64,1},1}
+  lexicon::Array{String,1}
+
+  Corpus(docs::Array{Document,1},
+         weights::Array{Array{Float64,1},1},
+         lexicon::Array{String,1}) = begin
+    return new(
+      docs,
+      weights,
+      lexicon
+    )
+  end
+
+  Corpus(docs::Array{Document,1},
+         lexicon::Array{String,1}) = begin
+    return new(
+      docs,
+      map(x -> ones(Float64,length(x)), docs), # no weights given, so weight every term 1.0
+      lexicon
+    )
+  end
+end
+
+function length(corpus::AbstractCorpus)
+  return length(corpus.docs)
+end
+
+# Expand a term:count pair into a count-length sequence [term, term, ...]
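+# For example, given the hypothetical LDA-C entry "4:3" (term index 4, count 3),
+# termToWordSequence below returns [5, 5, 5] after the zero- to one-based index shift.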
+function termToWordSequence(term::AbstractString) + parts = split(term, ":") + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) +end + +function readDocs(stream) + corpus = readlines(stream) + docs = Array{Document,1}(undef,length(corpus)) + for i in 1:length(corpus) + @inbounds terms = split(corpus[i], " ")[2:end] + @inbounds docs[i] = Document(termToWordSequence(terms[1])) + for ii in 2:length(terms) + @inbounds append!(docs[i].terms, termToWordSequence(terms[ii])) + end + end + return docs +end + +function readLexicon(stream) + lines = readlines(stream) + chomped = map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},chomped) # convert from substrings +end diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 2990d26..056937f 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -1,210 +1,34 @@ module TopicModels +#Imports import Base.length -RaggedMatrix{T} = Array{Array{T,1},1} - -struct Corpus - documents::RaggedMatrix{Int64} - weights::RaggedMatrix{Float64} - - Corpus(documents::RaggedMatrix{Int64}, - weights::RaggedMatrix{Float64}) = begin - return new( - documents, - weights - ) - end - - Corpus(documents::RaggedMatrix{Int64}) = begin - weights = map(documents) do doc - ones(Float64, length(doc)) - end - return new( - documents, - weights - ) - end -end - -struct Model - alphaPrior::Vector{Float64} - betaPrior::Float64 - topics::Array{Float64,2} - topicSums::Vector{Float64} - documentSums::Array{Float64,2} - assignments::RaggedMatrix{Int64} - frozen::Bool - corpus::Corpus - - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, - corpus::Corpus) = begin - K = length(alphaPrior) - m = new( - alphaPrior, - betaPrior, - zeros(Float64, K, V), # topics - zeros(Float64, K), # topicSums - zeros(Float64, K, length(corpus.documents)), #documentSums - Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments - false, - corpus - ) - initializeAssignments(m) - return m - end - - Model(trainedModel::Model, corpus::Corpus) = begin - m = new( - trainedModel.alphaPrior, - trainedModel.betaPrior, - trainedModel.topics, - trainedModel.topicSums, - trainedModel.documentSums, - fill(Array(Int64, 0), length(corpus.documents)), - true, - corpus - ) - initializeAssignments(m) - return m - end -end - -function length(corpus::Corpus) - return length(corpus.documents) -end - -function initializeAssignments(model::Model) - for dd in 1:length(model.corpus) - @inbounds words = model.corpus.documents[dd] - @inbounds model.assignments[dd] = fill(0, length(words)) - for ww in 1:length(words) - @inbounds word = words[ww] - topic = sampleMultinomial(model.alphaPrior) - @inbounds model.assignments[dd][ww] = topic - updateSufficientStatistics( - word, topic, dd, model.corpus.weights[dd][ww], model) - end - end - return -end - -function sampleMultinomial(p::Array{Float64,1}) - pSum = sum(p) - r = rand() * pSum - K = length(p) - for k in 1:K - if r < p[k] - return k - else - r -= p[k] - end - end - return 0 -end - -function wordDistribution(word::Int, - document::Int, - model::Model, - out::Vector{Float64}) - V = size(model.topics, 2) - for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / - (model.topicSums[ii] + V * model.betaPrior) - @inbounds out[ii] = u - end - return -end - -function sampleWord(word::Int, - document::Int, - model::Model, - p::Vector{Float64}) - wordDistribution(word, document, model, p) - sampleMultinomial(p) -end - - -function 
updateSufficientStatistics(word::Int64, - topic::Int64, - document::Int64, - scale::Float64, - model::Model) - fr = Float64(!model.frozen) - @inbounds model.documentSums[topic, document] += scale - @inbounds model.topicSums[topic] += scale * fr - @inbounds model.topics[topic, word] += scale * fr - return -end - -function sampleDocument(document::Int, - model::Model) - @inbounds words = model.corpus.documents[document] - Nw = length(words) - @inbounds weights = model.corpus.weights[document] - K = length(model.alphaPrior) - p = Array{Float64,1}(undef,K) - @inbounds assignments = model.assignments[document] - for ii in 1:Nw - @inbounds word = words[ii] - @inbounds oldTopic = assignments[ii] - updateSufficientStatistics(word, oldTopic, document, -weights[ii], model) - newTopic = sampleWord(word, document, model, p) - @inbounds assignments[ii] = newTopic - updateSufficientStatistics(word, newTopic, document, weights[ii], model) - end - return -end - -function sampleCorpus(model::Model) - for ii in 1:length(model.corpus) - sampleDocument(ii, model) - end - return -end - -# Note, files are zero indexed, but we are 1-indexed. -function termToWordSequence(term::AbstractString) - parts = split(term, ":") - fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end - -# The functions below are designed for public consumption -function trainModel(model::Model, - numIterations::Int64) - for ii in 1:numIterations - println(string("Iteration ", ii, "...")) - sampleCorpus(model) - end - return -end - -function topTopicWords(model::Model, - lexicon::Array{String,1}, - numWords::Int64) - [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] - for row in 1:size(model.topics,1)] -end - -function readDocuments(stream) - lines = readlines(stream) - convert(RaggedMatrix{Int64}, - [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) 
- for line in lines]) -end - -function readLexicon(stream) - lines = readlines(stream) - convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) -end +using Random, Distributions, Plots, UMAP +using SpecialFunctions: loggamma +using Clustering: randindex +#Exports export Corpus, + LdaCorpus, Model, - readDocuments, + State, + readDocs, readLexicon, + termToWordSequence, topTopicWords, - trainModel -end + trainModel, + GenerateDoc, + CorpusTopics, + CorpusARI, + DocsARI, + sampleDocument + +#Data that we make or find in real life: +include("Data.jl") + +#Bayesian learning and inference: +include("Computation.jl") + +#Stuff like perplexity and ARI: +include("Validation.jl") +end #module diff --git a/src/Validation.jl b/src/Validation.jl new file mode 100644 index 0000000..57710e5 --- /dev/null +++ b/src/Validation.jl @@ -0,0 +1,15 @@ +function CorpusARI(state::State,model::Model,corpus::LdaCorpus) + #for synthetic data, turn our mixed membership document vectors into max likelihood assignments + # and check ARI between the ground truth and the state + + learned_max_clust = map(i->i[1], findmax(getTopicDist(state,model),dims=1)[2]) + true_max_clust = map(i->i[1], findmax(CorpusTopics(corpus),dims=1)[2]) + randindex(learned_max_clust,true_max_clust) +end + +function DocsARI(state::State,corpus::LdaCorpus) + #for synthetic data, find the topic ARI across [all terms in] all documents in the corpus + learned_clust = AllTopics(state) + true_clust = AllTopics(corpus) + randindex(learned_clust,true_clust) +end From 827bb48f6d43cf591591c94f02e8aeed30367a40 Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Fri, 6 Mar 2020 16:54:13 -0800 Subject: [PATCH 4/5] Add unit test for Gibbs sampler, etc 1) Per-word topics: add a test for consistency (with the full joint) of the corresponding conditional 2) Get rid of mutability on structs in src/Data.jl in favor of in-place assignment --- examples/LDA.jl | 15 +++++++---- src/Data.jl | 10 ++++---- src/TopicModels.jl | 4 +-- test/Gibbs_unit_tests.jl | 55 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 test/Gibbs_unit_tests.jl diff --git a/examples/LDA.jl b/examples/LDA.jl index 9360e0f..32027fc 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,5 +1,8 @@ using TopicModels, Plots, UMAP +################################################################################################################################## +# Fit and Visualize Real-World Text Data + exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) @@ -13,10 +16,13 @@ state = State(model,corpus) @time trainModel(model, state, 30) topWords = topTopicWords(model, state, 10) -embedding = umap(state.topics, 2, n_neighbors=10) +# visualize the fit +@time embedding = umap(state.topics, 2, n_neighbors=10) maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2])) -scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max on Learned", marker=(2, 2, :auto, stroke(0))) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max-Likelihood Doc Topics on Learned", marker=(2, 2, :auto, stroke(0))) +################################################################################################################################## +# Fit, Validate, and Visualize Synthetic Data Derived from a Fully-Generative Simulation (Poisson-distributed document-length) k = 10 lexLength = 1000 @@ -31,11 +37,10 @@ testState = 
State(testModel, testCorpus)
 @time trainModel(testModel, testState, 100)
 
 # compute validation metrics on a single fit
-CorpusARI(testState,testModel,testCorpus)
-DocsARI(testState,testCorpus)
+CorpusARI(testState,testModel,testCorpus) # ARI for max-likelihood document topics
+DocsARI(testState,testCorpus) # ARI for actual word topics
 
 # visualize the fit
 @time embedding = umap(testState.topics, 2;n_neighbors=10)
-
 maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
 scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
diff --git a/src/Data.jl b/src/Data.jl
index fb5379a..14c8794 100644
--- a/src/Data.jl
+++ b/src/Data.jl
@@ -1,7 +1,7 @@
 ### Document
 abstract type AbstractDocument end
 
-mutable struct LdaDocument <: AbstractDocument
+struct LdaDocument <: AbstractDocument
   # fully observed data from the LDA model
   theta::Array{Float64,1} # the topic probs for the doc
   z::Array{Int64,1} # the topic for each word
   terms::Array{Int64,1} # the word tokens
@@ -31,9 +31,9 @@ function GenerateDoc(doc::LdaDocument,
                      alpha::Array{Float64,1},
                      Phi::Array{Float64,2})
   dd = Dirichlet(alpha)
-  doc.theta = vec(rand(dd,1))
+  doc.theta .= vec(rand(dd,1))
   cat = Categorical(vec(doc.theta))
-  doc.z = rand(cat,length(doc))
+  doc.z .= rand(cat,length(doc))
   for i in 1:length(doc)
     @inbounds dc = Categorical(Phi[:,doc.z[i]])
     @inbounds doc.terms[i] = rand(dc,1)[1]
@@ -41,7 +41,7 @@ function GenerateDoc(doc::LdaDocument,
   return
 end
 
-mutable struct Document <: AbstractDocument
+struct Document <: AbstractDocument
   # actual data, where only the terms are observed
   terms::Array{Int64,1} # the word tokens
   Document(terms::Array{Int64,1}) = new(terms)
@@ -54,7 +54,7 @@ end
 ### Corpus
 abstract type AbstractCorpus end
 
-mutable struct LdaCorpus <: AbstractCorpus
+struct LdaCorpus <: AbstractCorpus
   # fully observed data from the LDA model
   docs::Array{LdaDocument,1}
   alpha::Array{Float64,1}
   beta::Array{Float64,1}
diff --git a/src/TopicModels.jl b/src/TopicModels.jl
index 056937f..f7508fa 100644
--- a/src/TopicModels.jl
+++ b/src/TopicModels.jl
@@ -17,11 +17,9 @@ export Corpus,
        termToWordSequence,
        topTopicWords,
        trainModel,
-       GenerateDoc,
        CorpusTopics,
        CorpusARI,
-       DocsARI,
-       sampleDocument
+       DocsARI
 
 #Data that we make or find in real life:
 include("Data.jl")
diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
new file mode 100644
index 0000000..3998c47
--- /dev/null
+++ b/test/Gibbs_unit_tests.jl
@@ -0,0 +1,55 @@
+using Test, TopicModels, Random
+using TopicModels: updateSufficientStatistics, joint_log_p #non-exported fns we need
+
+
+# use equality of likelihood ratios to test that the conditional distribution is consistent with the joint distribution
+@testset "LDA docs" begin
+  # generate some data from LDA where the document length is Poisson-distributed
+  k = 7
+  lexLength = 10
+  corpLambda = 10 # poisson parameter for random doc length
+  corpLength = 10
+  scaleK = 0.1
+  scaleL = 0.1
+  Random.seed!(123)
+
+  corpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
+
+  model = Model(corpus.alpha, corpus.beta, corpus)
+  state = State(model, corpus)
+  trainModel(model, state, 10) # update all the state variables
+
+  # pick a random doc/word to iterate the sampler
+  doc_ind = rand(1:corpLength)
+  word_ind = rand(1:length(corpus.docs[doc_ind]))
+  word = corpus.docs[doc_ind].terms[word_ind]
+
+  conditional = state.conditionals[doc_ind][word_ind,:]
+  oldTopic = copy(state.assignments[doc_ind][word_ind]) # the topic currently assigned to this word
+
+  newTopic = rand(collect(1:k)[1:end .!= oldTopic],1) # a different topic
+  newTopic = Int64(newTopic[1])
+
+  # get the original state probs
+  joint_Lw = copy(joint_log_p(model,state)) # log prob of the full joint under the original topic assignment
+  cond_Lw = log(state.conditionals[doc_ind][word_ind,oldTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=oldTopic|...)
+  cond_Lw_new = log(state.conditionals[doc_ind][word_ind,newTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=newTopic|...)
+
+  updateSufficientStatistics(word, oldTopic, doc_ind, -model.corpus.weights[doc_ind][word_ind], state) # remove counts for the old topic
+  updateSufficientStatistics(word, newTopic, doc_ind, model.corpus.weights[doc_ind][word_ind], state) # update stats for the new topic
+  joint_Lw_new = copy(joint_log_p(model,state)) # log prob of the full joint under the new topic assignment
+
+  print("joint_Lw: ", joint_Lw, "\n")
+  print("cond_Lw: ", cond_Lw, "\n")
+
+  print("joint_Lw_new: ", joint_Lw_new, "\n")
+  print("cond_Lw_new: ", cond_Lw_new, "\n")
+
+  print("joint_LR: ", joint_Lw_new-joint_Lw, "\n")
+  print("cond_LR: ", cond_Lw_new-cond_Lw, "\n")
+  print("old Topic: ", oldTopic, "\n")
+  print("new Topic: ", newTopic, "\n")
+
+  # why is this happening? normalizers should cancel for both ratios, so our propto sampling dist should work
+  @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
+end
From 16cc7903e9997ca5ec084987f346d0a6da82700b Mon Sep 17 00:00:00 2001
From: Matt Karikomi
Date: Sun, 8 Mar 2020 11:48:27 -0700
Subject: [PATCH 5/5] comments in gibbs tests

---
 test/Gibbs_unit_tests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
index 3998c47..fda7937 100644
--- a/test/Gibbs_unit_tests.jl
+++ b/test/Gibbs_unit_tests.jl
@@ -50,6 +50,5 @@ using TopicModels: updateSufficientStatistics, joint_log_p #non-exported fns we
   print("old Topic: ", oldTopic, "\n")
   print("new Topic: ", newTopic, "\n")
 
-  # why is this happening? normalizers should cancel for both ratios, so our propto sampling dist should work
   @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
 end
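A note on the identity the Gibbs unit test relies on: for a discrete variable z, the conditional satisfies p(z = k | rest) ∝ p(z = k, rest), so a log-likelihood ratio between two values of z is the same whether it is computed from the conditional or from the full joint, because the normalizer cancels. A minimal self-contained sketch of that identity, using only toy probabilities and base Julia rather than the package's API:

```julia
# Toy check of the joint/conditional ratio identity used by the test above.
w = rand(4)                      # unnormalized joint probabilities of 4 states
logjoint = log.(w)               # log joint evaluated at each state
cond = w ./ sum(w)               # the normalized conditional distribution

k_old, k_new = 1, 3
joint_lr = logjoint[k_new] - logjoint[k_old]    # log-ratio from the joint
cond_lr = log(cond[k_new]) - log(cond[k_old])   # log-ratio from the conditional
@assert isapprox(joint_lr, cond_lr; atol=1e-10) # the normalizer cancels
```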