From 295e3e39dffe80b86f863b778cdd8c267c676a2b Mon Sep 17 00:00:00 2001 From: Fineday Date: Thu, 14 Apr 2016 19:31:17 +0800 Subject: [PATCH 1/5] fix deprecated warning for julia 0.4 --- README.md | 10 ++++++++++ src/TopicModels.jl | 18 ++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a8c984a..2dd09d3 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ document. The space whence the words are drawn is termed the lexicon. Formally, the model is defined as +``` For each topic k, phi_k ~ Dirichlet(beta) For each document d, @@ -21,6 +22,7 @@ Formally, the model is defined as For each word w, z ~ Multinomial(theta) w ~ Multinomial(phi_z) +``` alpha and beta are hyperparameters of the model. The number of topics, K, is a fixed parameter of the model, and w is observed. This package fits @@ -31,8 +33,10 @@ the topics using collapsed Gibbs sampling (Griffiths and Steyvers, 2004). We describe the functions of the package using an example. First we load corpora from data files as follows: +``` testDocuments = readDocuments(open("cora.documents")) testLexicon = readLexicon(open("cora.lexicon")) +``` These read files in LDA-C format. The lexicon file is assumed to have one word per line. The document file consists of one document per line. Each @@ -45,7 +49,9 @@ the number of tuples for that document. With the documents loaded, we instantiate a model that we want to train: +``` model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments) +``` This is a model with 10 topics. alpha is set to a uniform Dirichlet prior with 0.1 weight on each topic (the dimension of this variable is used @@ -54,7 +60,9 @@ the prior weight on phi (i.e. beta) should be set to 0.01. The third parameter is the lexicon size; here we just use the lexicon we have just read. The fourth parameter is the collection of documents. +``` trainModel(testDocuments, model, 30) +``` With the model defined, we can train the model on a corpus of documents. The trainModel command takes the corpus as the first argument, the model @@ -64,7 +72,9 @@ will be mutated in place. Finally we can examine the output of the trained model using topTopicWords. +``` topWords = topTopicWords(model, testLexicon, 10) +``` This function retrieves the top words associated with each topic; this serves as a useful summary of the model. The first parameter is the model, diff --git a/src/TopicModels.jl b/src/TopicModels.jl index d1bd4e2..38696f5 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -133,7 +133,7 @@ function updateSufficientStatistics(word::Int64, document::Int64, scale::Float64, model::Model) - fr = float64(!model.frozen) + fr = Float64(!model.frozen) @inbounds model.documentSums[topic, document] += scale @inbounds model.topicSums[topic] += scale * fr @inbounds model.topics[topic, word] += scale * fr @@ -167,9 +167,9 @@ function sampleCorpus(model::Model) end # Note, files are zero indexed, but we are 1-indexed. 
-function termToWordSequence(term::String) +function termToWordSequence(term::AbstractString) parts = split(term, ":") - fill(int64(parts[1]) + 1, int64(parts[2])) + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) end # The functions below are designed for public consumption @@ -190,16 +190,15 @@ function topTopicWords(model::Model, end function readDocuments(stream) - lines = readlines(stream) - convert( - RaggedMatrix{Int64}, - [apply(vcat, [termToWordSequence(term) for term in split(line, " ")[2:end]]) - for line in lines]) + lines = readlines(stream) + convert(RaggedMatrix{Int64}, + [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) + for line in lines]) end function readLexicon(stream) lines = readlines(stream) - map(chomp, convert(Array{String,1}, lines)) + map(chomp, convert(Array{AbstractString,1}, lines)) end export Corpus, @@ -208,5 +207,4 @@ export Corpus, readLexicon, topTopicWords, trainModel - end From feec08d28d60013d62069eb6122ca1d525cb852a Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Sat, 29 Feb 2020 21:37:45 -0800 Subject: [PATCH 2/5] v1.3 compat v1.3 compat fixed lexicon --- .DS_Store | Bin 0 -> 6148 bytes Manifest.toml | 188 +++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 18 +++++ REQUIRE | 0 src/TopicModels.jl | 34 ++++---- 5 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 .DS_Store create mode 100644 Manifest.toml create mode 100644 Project.toml delete mode 100644 REQUIRE diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8b6cf85eb3dc8b3f099f156b7f1f2012e201d7ab GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8ZYyFAq8@Yc)n$)1NlDE(|@)3L; zXLdJYvEWU_&cNLUhY(d!?ONMe)~MS~ z)0x%ntknX&xqmR5RmIMpb9mCbdw9&AX3sC0T@K_c*)&+dD=4P8dv6krWONH7MRXBG zNDL4I!~ij{WDJ;tK&&rW-Bd0yKn(oM0PYVqG(^{6u2F3r(BbtN{S`zM(D5ySXd845 z<{H5R!gVU3PUYr_!F4)}+a}I6m}}JOjH{Jl9Pi2-6@o`JG9 zEj<5^;g?zX$e&Lkix?mV{uu+j((yWND9W6z-^#`+sePmZT dwckb^;#`BdMw|u5RXQMD1Qa3E5d*)#zz0PoPhS84 literal 0 HcmV?d00001 diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..c704ecc --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,188 @@ +# This file is machine-generated - editing it directly is not advised + +[[Arpack]] +deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] +git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" +uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +version = "0.4.0" + +[[Arpack_jll]] +deps = ["Libdl", "OpenBLAS_jll", "Pkg"] +git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" +uuid = "68821587-b530-5797-8361-c406ea357684" +version = "3.5.0+2" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.8" + +[[DataAPI]] +git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.1.0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] 
+git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.22.4" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.8.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.3" + +[[OpenBLAS_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.7+5" + +[[OpenSpecFun_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+1" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.1.0" + +[[PDMats]] +deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] +git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.9.11" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.3.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Rmath]] +deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] +git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.6.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.10.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +uuid = 
"2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.1" + +[[StatsFuns]] +deps = ["Rmath", "SpecialFunctions"] +git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.4" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..65ba1d2 --- /dev/null +++ b/Project.toml @@ -0,0 +1,18 @@ +name = "TopicModels" +uuid = "e9825ca3-3499-4c9b-97dc-a93734876e50" +authors = ["Jonathan Chang "] +version = "0.1.0" + +[deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" + +[compat] +julia = "1.3" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/REQUIRE b/REQUIRE deleted file mode 100644 index e69de29..0000000 diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 38696f5..2990d26 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -2,9 +2,9 @@ module TopicModels import Base.length -typealias RaggedMatrix{T} Array{Array{T,1},1} +RaggedMatrix{T} = Array{Array{T,1},1} -type Corpus +struct Corpus documents::RaggedMatrix{Int64} weights::RaggedMatrix{Float64} @@ -15,7 +15,7 @@ type Corpus weights ) end - + Corpus(documents::RaggedMatrix{Int64}) = begin weights = map(documents) do doc ones(Float64, length(doc)) @@ -27,7 +27,7 @@ type Corpus end end -type Model +struct Model alphaPrior::Vector{Float64} betaPrior::Float64 topics::Array{Float64,2} @@ -37,9 +37,9 @@ type Model frozen::Bool corpus::Corpus - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, + Model(alphaPrior::Vector{Float64}, + betaPrior::Float64, + V::Int64, corpus::Corpus) = begin K = length(alphaPrior) m = new( @@ -48,7 +48,7 @@ type Model zeros(Float64, K, V), # topics zeros(Float64, K), # topicSums zeros(Float64, K, length(corpus.documents)), #documentSums - fill(Array(Int64, 0), length(corpus.documents)), # assignments + Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments false, corpus ) @@ -111,8 +111,8 @@ function wordDistribution(word::Int, out::Vector{Float64}) V = size(model.topics, 2) for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / + u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * + (model.topics[ii, word] + model.betaPrior) / (model.topicSums[ii] + V * model.betaPrior) @inbounds out[ii] = u end @@ -128,10 +128,10 @@ function sampleWord(word::Int, end -function updateSufficientStatistics(word::Int64, +function updateSufficientStatistics(word::Int64, topic::Int64, document::Int64, - scale::Float64, + scale::Float64, model::Model) fr = Float64(!model.frozen) @inbounds model.documentSums[topic, document] += scale @@ -146,7 +146,7 @@ function sampleDocument(document::Int, Nw = length(words) @inbounds weights = model.corpus.weights[document] K = length(model.alphaPrior) - p = Array(Float64, K) + p = Array{Float64,1}(undef,K) @inbounds assignments = model.assignments[document] for ii in 1:Nw @inbounds word = words[ii] @@ -170,10 +170,10 
@@ end function termToWordSequence(term::AbstractString) parts = split(term, ":") fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end +end # The functions below are designed for public consumption -function trainModel(model::Model, +function trainModel(model::Model, numIterations::Int64) for ii in 1:numIterations println(string("Iteration ", ii, "...")) @@ -183,7 +183,7 @@ function trainModel(model::Model, end function topTopicWords(model::Model, - lexicon::Array{ASCIIString,1}, + lexicon::Array{String,1}, numWords::Int64) [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] for row in 1:size(model.topics,1)] @@ -198,7 +198,7 @@ end function readLexicon(stream) lines = readlines(stream) - map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) end export Corpus, From 68884a6d8e9f97600dc59f2d20366d5c24715099 Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Sun, 1 Mar 2020 18:33:10 -0800 Subject: [PATCH 3/5] Refactoring 1) Type hierarchy for data: rooted at abstract corpus and document, which support subtypes representing fully-synthetic and real world data 2) Type hierarchy for MCMC: break struct model into "model" and "state" reflecting the scope (document locality) of latent variables vs model parameters and hyperpriors. This will facilitate clear cut testing in next PR based on Grosse and Duvenaud https://arxiv.org/abs/1412.5218 --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 1 + Manifest.toml | 367 +++++++++++++++++++++++++++++++++++++++++++-- Project.toml | 3 + examples/LDA.jl | 41 ++++- src/Computation.jl | 251 +++++++++++++++++++++++++++++++ src/Data.jl | 163 ++++++++++++++++++++ src/TopicModels.jl | 226 +++------------------------- src/Validation.jl | 15 ++ 9 files changed, 844 insertions(+), 223 deletions(-) delete mode 100644 .DS_Store create mode 100644 .gitignore create mode 100644 src/Computation.jl create mode 100644 src/Data.jl create mode 100644 src/Validation.jl diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 8b6cf85eb3dc8b3f099f156b7f1f2012e201d7ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z-O8ZYyFAq8@Yc)n$)1NlDE(|@)3L; zXLdJYvEWU_&cNLUhY(d!?ONMe)~MS~ z)0x%ntknX&xqmR5RmIMpb9mCbdw9&AX3sC0T@K_c*)&+dD=4P8dv6krWONH7MRXBG zNDL4I!~ij{WDJ;tK&&rW-Bd0yKn(oM0PYVqG(^{6u2F3r(BbtN{S`zM(D5ySXd845 z<{H5R!gVU3PUYr_!F4)}+a}I6m}}JOjH{Jl9Pi2-6@o`JG9 zEj<5^;g?zX$e&Lkix?mV{uu+j((yWND9W6z-^#`+sePmZT dwckb^;#`BdMw|u5RXQMD1Qa3E5d*)#zz0PoPhS84 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Manifest.toml b/Manifest.toml index c704ecc..469e496 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,5 +1,11 @@ # This file is machine-generated - editing it directly is not advised +[[ArnoldiMethod]] +deps = ["DelimitedFiles", "LinearAlgebra", "Random", "SparseArrays", "StaticArrays", "Test"] +git-tree-sha1 = "2b6845cea546604fb4dca4e31414a6a59d39ddcd" +uuid = "ec485272-7323-5ecc-a04f-4719b315124d" +version = "0.0.4" + [[Arpack]] deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" @@ -12,14 +18,62 @@ git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" uuid = "68821587-b530-5797-8361-c406ea357684" version = "3.5.0+2" +[[ArrayInterface]] +deps = ["LinearAlgebra", "Requires", "SparseArrays"] +git-tree-sha1 = "81e5dd1f5374aba2badfe967fc6a132c02ab471a" +uuid = 
"4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +version = "2.5.0" + [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" +[[Bzip2_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "92463331a641b19fc3baa427e0b76cdbd54dc05d" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.6+1" + +[[Clustering]] +deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "SparseArrays", "Statistics", "StatsBase"] +git-tree-sha1 = "225b796b1aa8b2e5c9c90bfb1f6779772d08bc00" +uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +version = "0.13.4" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.9.1" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"] +git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.11.2" + +[[CommonSubexpressions]] +deps = ["Test"] +git-tree-sha1 = "efdaf19ab11c7889334ca247ff4c9f7c322817b0" +uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" +version = "0.2.0" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.2.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "b57c5d019367c90f234a7bc7e24ff0a84971da5d" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.2.0+1" + +[[Contour]] +deps = ["StaticArrays"] +git-tree-sha1 = "6d56f927b33d3820561b8f89d7de311718683846" +uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" +version = "0.5.2" [[DataAPI]] git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" @@ -36,6 +90,28 @@ version = "0.17.10" deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[DiffResults]] +deps = ["StaticArrays"] +git-tree-sha1 = "da24935df8e0c6cf28de340b958f6aac88eaa0cc" +uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" +version = "1.0.2" + +[[DiffRules]] +deps = ["NaNMath", "Random", "SpecialFunctions"] +git-tree-sha1 = "eb0c34204c8410888844ada5359ac8b96292cfd1" +uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" +version = "1.0.1" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics"] +git-tree-sha1 = "23717536c81b63e250f682b0e0933769eecd1411" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.8.2" + [[Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -46,22 +122,105 @@ git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" version = "0.22.4" +[[FFMPEG]] +deps = ["FFMPEG_jll"] +git-tree-sha1 = "c82bef6fc01e30d500f588cd01d29bdd44f1924e" +uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" +version = "0.3.0" + +[[FFMPEG_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "LAME_jll", "LibVPX_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "Pkg", "Zlib_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] +git-tree-sha1 = 
"814bf7865005bee373521cb49cad46182bec53b4" +uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" +version = "4.1.0+2" + [[FillArrays]] deps = ["LinearAlgebra", "Random", "SparseArrays"] git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" version = "0.8.5" +[[FiniteDiff]] +deps = ["ArrayInterface", "LinearAlgebra", "Requires", "SparseArrays", "StaticArrays"] +git-tree-sha1 = "aa78e468afa6a0fde472c3dba0782d1ab60b203d" +uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" +version = "2.2.1" + +[[FixedPointNumbers]] +git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.7.1" + +[[ForwardDiff]] +deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "NaNMath", "Random", "SpecialFunctions", "StaticArrays"] +git-tree-sha1 = "88b082d492be6b63f967b6c96b352e25ced1a34c" +uuid = "f6369f11-7733-5829-9624-2563aa707210" +version = "0.10.9" + +[[FreeType2_jll]] +deps = ["Bzip2_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "8e290780d75bc0f676548c3bb84c153f83d14bdc" +uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" +version = "2.10.1+1" + +[[FriBidi_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e479537bf8a8f060c546553c14fd0633978dda7e" +uuid = "559328eb-81f9-559d-9380-de523a88c83c" +version = "1.0.5+2" + +[[GR]] +deps = ["Base64", "DelimitedFiles", "LinearAlgebra", "Printf", "Random", "Serialization", "Sockets", "Test", "UUIDs"] +git-tree-sha1 = "41dd1395d4dc559f1c2cb558cba784ef37b561fe" +uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" +version = "0.47.0" + +[[GeometryTypes]] +deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "StaticArrays"] +git-tree-sha1 = "9d7520999ca80a51f1bf41be2268a9ac0e4f0619" +uuid = "4d00f742-c7ba-57c2-abde-4428a4b178cb" +version = "0.8.1" + +[[Inflate]] +deps = ["Pkg", "Printf", "Random", "Test"] +git-tree-sha1 = "b7ec91c153cf8bff9aff58b39497925d133ef7fd" +uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +version = "0.1.1" + [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LAME_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "a46bff84977753fdba8db3c50db1435bb1eb4288" +uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" +version = "3.100.0+0" + [[LibGit2]] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[LibVPX_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e3549ca9bf35feb9d9d954f4c6a9032e92f46e7c" +uuid = "dd192d2f-8180-539f-9fb4-cc70b1dcf69a" +version = "1.8.1+1" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +[[LightGraphs]] +deps = ["ArnoldiMethod", "DataStructures", "Distributed", "Inflate", "LinearAlgebra", "Random", "SharedArrays", "SimpleTraits", "SparseArrays", "Statistics"] +git-tree-sha1 = "f40c4dbcd957cc3afc8cca0ff26c9f8304def00d" +uuid = "093fc24a-ae57-5d10-9952-331d41423f4d" +version = "1.3.1" + [[LinearAlgebra]] deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -69,27 +228,94 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[LsqFit]] +deps = ["Distributions", "LinearAlgebra", "NLSolversBase", "OptimBase", "Random", "StatsBase", "Test"] +git-tree-sha1 = "186c2afbdb3cd52191078cfc6176f7084ed9dfb7" +uuid = "2fda8390-95c7-5789-9bda-21331edee243" +version = "0.8.1" + +[[MacroTools]] +deps = ["DataStructures", "Markdown", "Random"] +git-tree-sha1 = 
"07ee65e03e28ca88bc9a338a3726ae0c3efaa94b" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.4" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[Measures]] +git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f" +uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" +version = "0.3.1" + [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" version = "0.4.3" +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NLSolversBase]] +deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] +git-tree-sha1 = "7c4e66c47848562003250f28b579c584e55becc0" +uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" +version = "7.6.1" + +[[NaNMath]] +git-tree-sha1 = "928b8ca9b2791081dc71a51c55347c27c618760f" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.3" + +[[NearestNeighborDescent]] +deps = ["DataStructures", "Distances", "LightGraphs", "Random", "Reexport", "SparseArrays"] +git-tree-sha1 = "77ac6b8529e22ee61d2322db7739579d18cd6d19" +uuid = "dd2c4c9e-a32f-5b2f-b342-08c2f244fce8" +version = "0.3.0" + +[[NearestNeighbors]] +deps = ["Distances", "StaticArrays"] +git-tree-sha1 = "8bc6180f328f3c0ea2663935db880d34c57d6eae" +uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +version = "0.4.4" + +[[Ogg_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "f4d4d03c562f40652c0baabd3e4cb2e756e157b7" +uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" +version = "1.3.3+0" + [[OpenBLAS_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +git-tree-sha1 = "adc45e596df7007d48bf6829efb1dc64fdec3ddc" uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" +version = "0.3.7+6" -[[OpenSpecFun_jll]] +[[OpenSSL_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +git-tree-sha1 = "33661eb9d5484220b4367d067f499b30bafc9c12" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "1.1.1+1" + +[[OpenSpecFun_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "d110040968b9afe95c6bd9c6233570b0fe8abd22" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" +version = "0.5.3+2" + +[[OptimBase]] +deps = ["Compat", "NLSolversBase", "Printf", "Reexport", "Test"] +git-tree-sha1 = "92667ab46a66ad502ec3044f65c41ea68b2e0e9c" +uuid = "87e2bd06-a317-5318-96d9-3ecbac512eee" +version = "2.0.0" + +[[Opus_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "c9931bf2fcdb57b48c227395c61ea82603212f7d" +uuid = "91d4177d-7536-5919-b921-800302f37372" +version = "1.3.1+0" [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] @@ -103,10 +329,34 @@ git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" version = "0.9.11" +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.12" + [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +[[PlotThemes]] +deps = ["PlotUtils", "Requires", "Statistics"] +git-tree-sha1 = "df772cc7c78862da96af1ee85cd0111c6640e44e" +uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" +version = "1.0.1" + +[[PlotUtils]] +deps = ["Colors", "Dates", "Printf", "Random", "Reexport"] +git-tree-sha1 = "a146cb72ec962aec81d478de49d1011db06dd754" +uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" +version = "0.6.3" + 
+[[Plots]] +deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryTypes", "JSON", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "Reexport", "Requires", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"] +git-tree-sha1 = "51d4d9154e71253abd2a7df2ee0e3d6b8d14f8b1" +uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +version = "0.29.5" + [[Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -125,11 +375,34 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[RecipesBase]] +git-tree-sha1 = "b4ed4a7f988ea2340017916f7c9e5d7560b52cae" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "0.8.0" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.0.1" + [[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "86c5647b565873641538d8f812c04e4c9dbeb370" uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" +version = "0.6.1" + +[[Rmath_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "1660f8fefbf5ab9c67560513131d4e933012fc4b" +uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.2.2+0" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -137,6 +410,22 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Showoff]] +deps = ["Dates"] +git-tree-sha1 = "e032c9df551fb23c9f98ae1064de074111b7bc39" +uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" +version = "0.3.1" + +[[SimpleTraits]] +deps = ["InteractiveUtils", "MacroTools"] +git-tree-sha1 = "2bdf3b6300a9d66fe29ee8bb51ba100c4df9ecbc" +uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" +version = "0.9.1" + [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -156,15 +445,21 @@ git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "0.10.0" +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.1" + [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +git-tree-sha1 = "19bfcb46245f69ff4013b3df3b977a289852c3a1" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" +version = "0.32.2" [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] @@ -180,9 +475,51 @@ uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[UMAP]] +deps = ["Arpack", "Distances", "LinearAlgebra", "LsqFit", "NearestNeighborDescent", "Random", "SparseArrays"] +git-tree-sha1 = "cfb648199a80ecb46eac88135714e38c23ff860d" +uuid = 
"c4f8c510-2410-5be4-91d7-4fbaeb39457e" +version = "0.1.5" + [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+8" + +[[libass_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "3fd3ea3525f2e3d337c54a52b2ca78a5a272bbf5" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+0" + +[[libfdk_aac_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "0e4ace600c20714a8dd67700c4502714d8473e8e" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+1" + +[[libvorbis_jll]] +deps = ["Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "71e54fb89ac3e0344c7185d1876fd96b0f246952" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+2" + +[[x264_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "23664c0757c3740050ca0e22944c786c165ca25a" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2019.5.25+1" + +[[x265_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9345e417084421a8e91373d6196bc58e660eed2a" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+0" diff --git a/Project.toml b/Project.toml index 65ba1d2..a3f7a69 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,12 @@ authors = ["Jonathan Chang "] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] julia = "1.3" diff --git a/examples/LDA.jl b/examples/LDA.jl index 16f7183..9360e0f 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,14 +1,41 @@ -using TopicModels +using TopicModels, Plots, UMAP -exdir = Pkg.dir("TopicModels", "examples") +exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") -testDocuments = readDocuments(open(joinpath(exdir, "cora.documents"))) +testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) testLexicon = readLexicon(open(joinpath(exdir, "cora.lexicon"))) -corpus = Corpus(testDocuments) +corpus = Corpus(testDocuments,testLexicon) +model = Model(fill(0.1, 10), fill(0.01,length(testLexicon)), corpus) +state = State(model,corpus) -model = Model(fill(0.1, 10), 0.01, length(testLexicon), corpus) +#@time Juno.@run trainModel(model, state, 30) +@time trainModel(model, state, 30) +topWords = topTopicWords(model, state, 10) -@time trainModel(model, 30) +embedding = umap(state.topics, 2, n_neighbors=10) +maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2])) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max on Learned", marker=(2, 2, :auto, stroke(0))) -topWords = topTopicWords(model, testLexicon, 21) + +k = 10 +lexLength = 1000 +corpLambda = 1000 # poisson parameter for random doc length +corpLength = 100 +scaleK = 0.01 +scaleL = 0.01 +testCorpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL) + +testModel = Model(testCorpus.alpha, testCorpus.beta, testCorpus) +testState = State(testModel, testCorpus) +@time trainModel(testModel, testState, 100) + +# compute validation metrics on a single fit +CorpusARI(testState,testModel,testCorpus) +DocsARI(testState,testCorpus) + +# visualize the fit +@time embedding = 
umap(testState.topics, 2;n_neighbors=10)
+
+maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
diff --git a/src/Computation.jl b/src/Computation.jl
new file mode 100644
index 0000000..13c8266
--- /dev/null
+++ b/src/Computation.jl
@@ -0,0 +1,251 @@
+struct Model
+  alphaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on document topics
+  betaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on words
+  corpus::AbstractCorpus
+
+  # initialize an untrained model
+  Model(alphaPrior::Vector{Float64},
+        betaPrior::Vector{Float64},
+        corpus::AbstractCorpus) = begin
+    K = length(alphaPrior)
+    m = new(
+      alphaPrior,
+      betaPrior,
+      corpus)
+    return m
+  end
+
+  # initialize a trained model
+  Model(trainedModel::Model,
+        corpus::AbstractCorpus) = begin
+    m = new(
+      trainedModel.alphaPrior,
+      trainedModel.betaPrior,
+      corpus
+    )
+    return m
+  end
+end
+
+struct State
+  topics::Array{Float64,2}
+  topicSums::Vector{Float64}
+  docSums::Array{Float64,2}
+  assignments::Array{Array{Int64,1},1}
+  conditionals::Array{Array{Float64,2},1} # the parameter vector p of each word's topic-assignment (categorical/multinomial) variable
+  frozen::Bool
+
+  # randomly initialize the state
+  State(model::Model,
+        corpus::AbstractCorpus) = begin
+    K = length(model.alphaPrior)
+    s = new(
+      zeros(Float64, K, length(corpus.lexicon)), # topics
+      zeros(Float64, K), # topicSums
+      zeros(Float64, K, length(corpus.docs)), # docSums
+      fill(Array{Int64,1}(undef,0), length(corpus.docs)), # assignments
+      fill(Array{Float64,2}(undef,0,K), length(corpus.docs)), # conditionals
+      false
+    )
+    initializeAssignments(model,s,corpus)
+    return s
+  end
+
+  # initialize the state from a trained model
+  State(topics::Array{Float64,2},
+        topicSums::Vector{Float64},
+        docSums::Array{Float64,2},
+        assignments::Array{Array{Int64,1},1},
+        conditionals::Array{Array{Float64,2},1},
+        frozen::Bool) = begin
+    s = new(
+      topics,
+      topicSums,
+      docSums,
+      assignments,
+      conditionals,
+      frozen
+    )
+    return s
+  end
+end
+
+function AllTopics(state::State)
+  alltopics = []
+  for i in 1:length(state.assignments)
+    append!(alltopics,state.assignments[i])
+  end
+  return convert(Array{Int,1},alltopics)
+end
+
+function initializeAssignments(model::Model,state::State,corpus::AbstractCorpus)
+  for dd in 1:length(corpus)
+    @inbounds words = corpus.docs[dd].terms
+    @inbounds state.assignments[dd] = zeros(length(words))
+    @inbounds state.conditionals[dd] = zeros(length(words), length(model.alphaPrior))
+    for ww in 1:length(words)
+      @inbounds word = words[ww]
+      @inbounds state.conditionals[dd][ww,:] = model.alphaPrior
+      topic = sampleMultinomial(ww,dd,state)
+      @inbounds state.assignments[dd][ww] = topic
+      updateSufficientStatistics(word, topic, dd,
+                                 model.corpus.weights[dd][ww],
+                                 state)
+    end
+  end
+  return
+end
+
+function sampleMultinomial(word_ind::Int64,
+                           document::Int64,
+                           state::State)
+  cond = state.conditionals[document][word_ind,:]
+  pSum = sum(cond)
+  r = rand() * pSum
+  K = length(cond)
+  for k in 1:K
+    if r < cond[k]
+      return k
+    else
+      @inbounds r -= cond[k]
+    end
+  end
+  return 0
+end
+
+function cond_word(word::Int,
+                   word_ind::Int,
+                   document::Int,
+                   model::Model,
+                   state::State)
+  V = size(state.topics, 2)
+  for ii in 1:length(model.alphaPrior)
+    @inbounds state.conditionals[document][word_ind,ii] =
+      (state.docSums[ii, document] + model.alphaPrior[ii]) *
+      (state.topics[ii, word] + model.betaPrior[word]) /
+      (state.topicSums[ii] + V * model.betaPrior[word])
+  end
+  return
+end
+
+function log_beta(x::Vector{Float64})
+  # compute the natural log of the multivariate beta function
+  lb = sum(loggamma.(x))
+  lb -= loggamma(sum(x))
+  return lb
+end
+
+function joint_log_p(model::Model,
+                     state::State)
+  # calculate the full joint log likelihood; this is useful for testing
+  log_pz = 0
+  for k in 1:length(model.alphaPrior)
+    @inbounds log_pz += (log_beta(state.topics[k,:] .+ model.betaPrior) -
+                         log_beta(model.betaPrior))
+  end
+  for d in 1:length(model.corpus)
+    @inbounds log_pz += (log_beta(state.docSums[:,d] .+ model.alphaPrior) -
+                         log_beta(model.alphaPrior))
+  end
+  return log_pz
+end
+
+function sampleWord(word::Int,
+                    word_ind::Int,
+                    document::Int,
+                    model::Model,
+                    state::State)
+  cond_word(word, word_ind, document, model, state)
+  sampleMultinomial(word_ind, document, state)
+end
+
+function updateSufficientStatistics(word::Int64,
+                                    topic::Int64,
+                                    document::Int64,
+                                    scale::Float64,
+                                    state::State)
+  fr = Float64(!state.frozen) # when the state is frozen, only the document-local counts change
+  @inbounds state.docSums[topic, document] += scale
+  @inbounds state.topicSums[topic] += scale * fr
+  @inbounds state.topics[topic, word] += scale * fr
+  return
+end
+
+@doc raw"""
+    getTermDist(state::State, model::Model)
+
+Compute ``\phi_{k,v} = \frac{\Psi_{k,v} + \beta_v}{\sum^V_{v'=1} \left( \Psi_{k,v'} + \beta_{v'} \right)}``
+
+Where ``\vec{\phi_k}`` parameterizes the V-dimensional categorical distribution over words for topic ``k``.
+
+Returns the ``K \times V`` matrix ``\Phi``.
+"""
+function getTermDist(state::State, model::Model)
+  Phi = Array{Float64,2}(undef,length(model.alphaPrior),length(model.betaPrior))
+  for topic in 1:length(model.alphaPrior)
+    Phi[topic,:] = (state.topics[topic,:] .+ model.betaPrior) ./ (state.topicSums[topic] + sum(model.betaPrior))
+  end
+  return Phi
+end
+
+@doc raw"""
+    getTopicDist(state::State, model::Model)
+
+Compute ``\theta_{k,m} = \frac{\Omega_{k,m} + \alpha_k}{\sum^K_{k'=1} \left( \Omega_{k',m} + \alpha_{k'} \right)}``
+
+Where ``\vec{\theta_m}`` parameterizes the K-dimensional categorical distribution of a document.
+
+Returns the ``K \times M`` matrix ``\Theta``, where ``M`` is the number of documents.
+"""
+function getTopicDist(state::State, model::Model)
+  Theta = Array{Float64,2}(undef,length(model.alphaPrior),length(model.corpus))
+  for doc in 1:length(model.corpus)
+    Theta[:,doc] = (state.docSums[:,doc] .+ model.alphaPrior) ./ (sum(state.docSums[:,doc]) + sum(model.alphaPrior))
+  end
+  return Theta
+end
+
+function sampleDocument(document::Int,
+                        model::Model,
+                        state::State)
+  words = model.corpus.docs[document].terms
+  Nw = length(words)
+  @inbounds weights = model.corpus.weights[document]
+  K = length(model.alphaPrior)
+  @inbounds assignments = state.assignments[document]
+  for ii in 1:Nw
+    word = words[ii]
+    oldTopic = assignments[ii]
+    updateSufficientStatistics(word, oldTopic, document, -weights[ii], state)
+    newTopic = sampleWord(word, ii, document, model, state)
+    @inbounds assignments[ii] = newTopic
+    updateSufficientStatistics(word, newTopic, document, weights[ii], state)
+  end
+  return
+end
+
+function sampleCorpus(model::Model, state::State)
+  for ii in 1:length(model.corpus)
+    sampleDocument(ii, model, state)
+  end
+  return
+end
+
+# The functions below are designed for public consumption
+function trainModel(model::Model,
+                    state::State,
+                    numIterations::Int64)
+  for ii in 1:numIterations
+    println(string("Iteration ", ii, "..."))
+    sampleCorpus(model, state)
+  end
+  return
+end
+
+function topTopicWords(model::Model,
+                       state::State,
+                       numWords::Int64)
+  [model.corpus.lexicon[reverse(sortperm(state.topics'[1:end, row]))[1:numWords]]
+   for row in 1:size(state.topics,1)]
+end
diff --git a/src/Data.jl b/src/Data.jl
new file mode 100644
index 0000000..fb5379a
--- /dev/null
+++ b/src/Data.jl
@@ -0,0 +1,163 @@
+### Document
+abstract type AbstractDocument end
+
+mutable struct LdaDocument <: AbstractDocument
+  # fully observed data from the LDA model
+  theta::Array{Float64,1} # the topic probs for the doc
+  z::Array{Int64,1} # the topic for each word
+  terms::Array{Int64,1} # the word tokens
+
+  LdaDocument(alpha::Array{Float64,1},
+              P::Array{Float64,2},
+              N::Int64) = begin # N is the length of the doc
+    d = new(
+      Array{Float64,1}(undef,size(P,2)),
+      Array{Int64,1}(undef,N),
+      Array{Int64,1}(undef,N)
+    )
+    GenerateDoc(d,alpha,P)
+    return d
+  end
+
+  LdaDocument(theta::Array{Float64,1},
+              z::Array{Int64,1},
+              terms::Array{Int64,1}) = begin
+    d = new(theta,z,terms)
+    return d
+  end
+end
+
+function GenerateDoc(doc::LdaDocument,
+                     alpha::Array{Float64,1},
+                     Phi::Array{Float64,2})
+  dd = Dirichlet(alpha)
+  doc.theta = vec(rand(dd,1))
+  cat = Categorical(vec(doc.theta))
+  doc.z = rand(cat,length(doc))
+  for i in 1:length(doc)
+    @inbounds dc = Categorical(Phi[:,doc.z[i]])
+    @inbounds doc.terms[i] = rand(dc,1)[1]
+  end
+  return
+end
+
+mutable struct Document <: AbstractDocument
+  # actual data, where only the terms are observed
+  terms::Array{Int64,1} # the word tokens
+  Document(terms::Array{Int64,1}) = new(terms)
+end
+
+function length(doc::AbstractDocument)
+  return size(doc.terms,1)
+end
+
+### Corpus
+abstract type AbstractCorpus end
+
+mutable struct LdaCorpus <: AbstractCorpus
+  # fully observed data from the LDA model
+  docs::Array{LdaDocument,1}
+  alpha::Array{Float64,1}
+  beta::Array{Float64,1}
+  Phi::Array{Float64,2}
+  weights::Array{Array{Float64,1}} # only unweighted terms supported
+  lexicon::Array{String,1}
+
+  LdaCorpus(k::Int64,
+            lexLength::Int64,
+            corpLambda::Int64,
+            corpLength::Int64,
+            scaleK::Float64,
+            scaleL::Float64) = begin
+    w = Array{Array{Float64,1},1}(undef,corpLength)
+    lex = string.([1:1:lexLength;]) # there is no real lexicon for synthetic data, so the token strings are just "1".."lexLength"
+    a = fill(scaleK,k) # scale parameter for the Dirichlet topic prior
+    b = fill(scaleL,lexLength) # scale parameter for the Dirichlet token prior
+    dl = Poisson(corpLambda)
+    docLengths = rand(dl,corpLength) # the lengths of the docs in the corpus
+    db = Dirichlet(b)
+    P = rand(db,k) # topic-token distributions drawn from the Dirichlet token prior: one lexLength-vector for each of the k topics
+    d = Array{LdaDocument,1}(undef,corpLength)
+    for i in 1:corpLength
+      w[i] = ones(docLengths[i])
+      @inbounds d[i] = LdaDocument(a,P,docLengths[i])
+    end
+    return new(d, a, b, P, w, lex)
+  end
+
+  LdaCorpus(docs::Array{LdaDocument,1}, # the documents
+            alpha::Array{Float64,1},
+            beta::Array{Float64,1},
+            Phi::Array{Float64,2},
+            weights::Array{Array{Float64,1},1},
+            lexicon::Array{String,1}) = begin
+    c = new(docs,alpha,beta,Phi,weights,lexicon)
+    return c
+  end
+end
+
+function CorpusTopics(corpus::LdaCorpus)
+  cat(dims=2,map(i->vec(i.theta), corpus.docs)...) # get a 2d array of (document-wise) mixed membership for the corpus
+end
+
+function AllTopics(corpus::LdaCorpus)
+  alltopics = []
+  for i in 1:length(corpus)
+    append!(alltopics,corpus.docs[i].z)
+  end
+  return convert(Array{Int,1},alltopics)
+end
+
+struct Corpus <: AbstractCorpus
+  docs::Array{Document,1}
+  weights::Array{Array{Float64,1},1}
+  lexicon::Array{String,1}
+
+  Corpus(docs::Array{Document,1},
+         weights::Array{Array{Float64,1},1},
+         lexicon::Array{String,1}) = begin
+    return new(
+      docs,
+      weights,
+      lexicon
+    )
+  end
+
+  Corpus(docs::Array{Document,1},
+         lexicon::Array{String,1}) = begin
+    return new(
+      docs,
+      map(x -> ones(Float64,length(x)), docs), # no weights given, so weight every term 1.0
+      lexicon
+    )
+  end
+end
+
+function length(corpus::AbstractCorpus)
+  return length(corpus.docs)
+end
+
+# Expand a term:count pair into a count-length sequence [term, term, ...]
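+# For example, given the hypothetical LDA-C entry "4:3" (term index 4, count 3),
+# termToWordSequence below returns [5, 5, 5] after the zero- to one-based index shift.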
+function termToWordSequence(term::AbstractString) + parts = split(term, ":") + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) +end + +function readDocs(stream) + corpus = readlines(stream) + docs = Array{Document,1}(undef,length(corpus)) + for i in 1:length(corpus) + @inbounds terms = split(corpus[i], " ")[2:end] + @inbounds docs[i] = Document(termToWordSequence(terms[1])) + for ii in 2:length(terms) + @inbounds append!(docs[i].terms, termToWordSequence(terms[ii])) + end + end + return docs +end + +function readLexicon(stream) + lines = readlines(stream) + chomped = map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},chomped) # convert from substrings +end diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 2990d26..056937f 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -1,210 +1,34 @@ module TopicModels +#Imports import Base.length -RaggedMatrix{T} = Array{Array{T,1},1} - -struct Corpus - documents::RaggedMatrix{Int64} - weights::RaggedMatrix{Float64} - - Corpus(documents::RaggedMatrix{Int64}, - weights::RaggedMatrix{Float64}) = begin - return new( - documents, - weights - ) - end - - Corpus(documents::RaggedMatrix{Int64}) = begin - weights = map(documents) do doc - ones(Float64, length(doc)) - end - return new( - documents, - weights - ) - end -end - -struct Model - alphaPrior::Vector{Float64} - betaPrior::Float64 - topics::Array{Float64,2} - topicSums::Vector{Float64} - documentSums::Array{Float64,2} - assignments::RaggedMatrix{Int64} - frozen::Bool - corpus::Corpus - - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, - corpus::Corpus) = begin - K = length(alphaPrior) - m = new( - alphaPrior, - betaPrior, - zeros(Float64, K, V), # topics - zeros(Float64, K), # topicSums - zeros(Float64, K, length(corpus.documents)), #documentSums - Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments - false, - corpus - ) - initializeAssignments(m) - return m - end - - Model(trainedModel::Model, corpus::Corpus) = begin - m = new( - trainedModel.alphaPrior, - trainedModel.betaPrior, - trainedModel.topics, - trainedModel.topicSums, - trainedModel.documentSums, - fill(Array(Int64, 0), length(corpus.documents)), - true, - corpus - ) - initializeAssignments(m) - return m - end -end - -function length(corpus::Corpus) - return length(corpus.documents) -end - -function initializeAssignments(model::Model) - for dd in 1:length(model.corpus) - @inbounds words = model.corpus.documents[dd] - @inbounds model.assignments[dd] = fill(0, length(words)) - for ww in 1:length(words) - @inbounds word = words[ww] - topic = sampleMultinomial(model.alphaPrior) - @inbounds model.assignments[dd][ww] = topic - updateSufficientStatistics( - word, topic, dd, model.corpus.weights[dd][ww], model) - end - end - return -end - -function sampleMultinomial(p::Array{Float64,1}) - pSum = sum(p) - r = rand() * pSum - K = length(p) - for k in 1:K - if r < p[k] - return k - else - r -= p[k] - end - end - return 0 -end - -function wordDistribution(word::Int, - document::Int, - model::Model, - out::Vector{Float64}) - V = size(model.topics, 2) - for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / - (model.topicSums[ii] + V * model.betaPrior) - @inbounds out[ii] = u - end - return -end - -function sampleWord(word::Int, - document::Int, - model::Model, - p::Vector{Float64}) - wordDistribution(word, document, model, p) - sampleMultinomial(p) -end - - -function 
updateSufficientStatistics(word::Int64, - topic::Int64, - document::Int64, - scale::Float64, - model::Model) - fr = Float64(!model.frozen) - @inbounds model.documentSums[topic, document] += scale - @inbounds model.topicSums[topic] += scale * fr - @inbounds model.topics[topic, word] += scale * fr - return -end - -function sampleDocument(document::Int, - model::Model) - @inbounds words = model.corpus.documents[document] - Nw = length(words) - @inbounds weights = model.corpus.weights[document] - K = length(model.alphaPrior) - p = Array{Float64,1}(undef,K) - @inbounds assignments = model.assignments[document] - for ii in 1:Nw - @inbounds word = words[ii] - @inbounds oldTopic = assignments[ii] - updateSufficientStatistics(word, oldTopic, document, -weights[ii], model) - newTopic = sampleWord(word, document, model, p) - @inbounds assignments[ii] = newTopic - updateSufficientStatistics(word, newTopic, document, weights[ii], model) - end - return -end - -function sampleCorpus(model::Model) - for ii in 1:length(model.corpus) - sampleDocument(ii, model) - end - return -end - -# Note, files are zero indexed, but we are 1-indexed. -function termToWordSequence(term::AbstractString) - parts = split(term, ":") - fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end - -# The functions below are designed for public consumption -function trainModel(model::Model, - numIterations::Int64) - for ii in 1:numIterations - println(string("Iteration ", ii, "...")) - sampleCorpus(model) - end - return -end - -function topTopicWords(model::Model, - lexicon::Array{String,1}, - numWords::Int64) - [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] - for row in 1:size(model.topics,1)] -end - -function readDocuments(stream) - lines = readlines(stream) - convert(RaggedMatrix{Int64}, - [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) 
- for line in lines]) -end - -function readLexicon(stream) - lines = readlines(stream) - convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) -end +using Random, Distributions, Plots, UMAP +using SpecialFunctions: loggamma +using Clustering: randindex +#Exports export Corpus, + LdaCorpus, Model, - readDocuments, + State, + readDocs, readLexicon, + termToWordSequence, topTopicWords, - trainModel -end + trainModel, + GenerateDoc, + CorpusTopics, + CorpusARI, + DocsARI, + sampleDocument + +#Data that we make or find in real life: +include("Data.jl") + +#Bayesian learning and inference: +include("Computation.jl") + +#Stuff like perplexity and ARI: +include("Validation.jl") +end #module diff --git a/src/Validation.jl b/src/Validation.jl new file mode 100644 index 0000000..57710e5 --- /dev/null +++ b/src/Validation.jl @@ -0,0 +1,15 @@ +function CorpusARI(state::State,model::Model,corpus::LdaCorpus) + #for synthetic data, turn our mixed membership document vectors into max likelihood assignments + # and check ARI between the ground truth and the state + + learned_max_clust = map(i->i[1], findmax(getTopicDist(state,model),dims=1)[2]) + true_max_clust = map(i->i[1], findmax(CorpusTopics(corpus),dims=1)[2]) + randindex(learned_max_clust,true_max_clust) +end + +function DocsARI(state::State,corpus::LdaCorpus) + #for synthetic data, find the topic ARI across [all terms in] all documents in the corpus + learned_clust = AllTopics(state) + true_clust = AllTopics(corpus) + randindex(learned_clust,true_clust) +end From 827bb48f6d43cf591591c94f02e8aeed30367a40 Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Fri, 6 Mar 2020 16:54:13 -0800 Subject: [PATCH 4/5] Add unit test for Gibbs sampler, etc 1) Per-word topics: add a test for consistency (with the full joint) of the corresponding conditional 2) Get rid of mutability on structs in src/Data.jl in favor of in-place assignment --- examples/LDA.jl | 15 +++++++---- src/Data.jl | 10 ++++---- src/TopicModels.jl | 4 +-- test/Gibbs_unit_tests.jl | 55 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 test/Gibbs_unit_tests.jl diff --git a/examples/LDA.jl b/examples/LDA.jl index 9360e0f..32027fc 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,5 +1,8 @@ using TopicModels, Plots, UMAP +################################################################################################################################## +# Fit and Visualize Real-World Text Data + exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) @@ -13,10 +16,13 @@ state = State(model,corpus) @time trainModel(model, state, 30) topWords = topTopicWords(model, state, 10) -embedding = umap(state.topics, 2, n_neighbors=10) +# visualize the fit +@time embedding = umap(state.topics, 2, n_neighbors=10) maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2])) -scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max on Learned", marker=(2, 2, :auto, stroke(0))) +scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max-Likelihood Doc Topics on Learned", marker=(2, 2, :auto, stroke(0))) +################################################################################################################################## +# Fit, Validate, and Visualize Synthetic Data Derived from a Fully-Generative Simulation (Poisson-distributed document-length) k = 10 lexLength = 1000 @@ -31,11 +37,10 @@ testState = 
State(testModel, testCorpus)
 @time trainModel(testModel, testState, 100)
 
 # compute validation metrics on a single fit
-CorpusARI(testState,testModel,testCorpus)
-DocsARI(testState,testCorpus)
+CorpusARI(testState,testModel,testCorpus) # ARI for max-likelihood document topics
+DocsARI(testState,testCorpus) # ARI for actual word topics
 
 # visualize the fit
 @time embedding = umap(testState.topics, 2;n_neighbors=10)
-
 maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
 scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
diff --git a/src/Data.jl b/src/Data.jl
index fb5379a..14c8794 100644
--- a/src/Data.jl
+++ b/src/Data.jl
@@ -1,7 +1,7 @@
 ### Document
 abstract type AbstractDocument end
 
-mutable struct LdaDocument <: AbstractDocument
+struct LdaDocument <: AbstractDocument
   # fully observed data from the LDA model
   theta::Array{Float64,1} # the topic probs for the doc
   z::Array{Int64,1} # the topic for each word
   terms::Array{Int64,1} # the word tokens
@@ -31,9 +31,9 @@ function GenerateDoc(doc::LdaDocument,
                      alpha::Array{Float64,1},
                      Phi::Array{Float64,2})
   dd = Dirichlet(alpha)
-  doc.theta = vec(rand(dd,1))
+  doc.theta .= vec(rand(dd,1))
   cat = Categorical(vec(doc.theta))
-  doc.z = rand(cat,length(doc))
+  doc.z .= rand(cat,length(doc))
   for i in 1:length(doc)
     @inbounds dc = Categorical(Phi[:,doc.z[i]])
     @inbounds doc.terms[i] = rand(dc,1)[1]
@@ -41,7 +41,7 @@ function GenerateDoc(doc::LdaDocument,
   return
 end
 
-mutable struct Document <: AbstractDocument
+struct Document <: AbstractDocument
   # actual data, where only the terms are observed
   terms::Array{Int64,1} # the word tokens
   Document(terms::Array{Int64,1}) = new(terms)
@@ -54,7 +54,7 @@ end
 ### Corpus
 abstract type AbstractCorpus end
 
-mutable struct LdaCorpus <: AbstractCorpus
+struct LdaCorpus <: AbstractCorpus
   # fully observed data from the LDA model
   docs::Array{LdaDocument,1}
   alpha::Array{Float64,1}
   beta::Array{Float64,1}
diff --git a/src/TopicModels.jl b/src/TopicModels.jl
index 056937f..f7508fa 100644
--- a/src/TopicModels.jl
+++ b/src/TopicModels.jl
@@ -17,11 +17,9 @@ export Corpus,
        termToWordSequence,
        topTopicWords,
        trainModel,
-       GenerateDoc,
        CorpusTopics,
        CorpusARI,
-       DocsARI,
-       sampleDocument
+       DocsARI
 
 #Data that we make or find in real life:
 include("Data.jl")
diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
new file mode 100644
index 0000000..3998c47
--- /dev/null
+++ b/test/Gibbs_unit_tests.jl
@@ -0,0 +1,55 @@
+using Test, TopicModels, Random
+using TopicModels: updateSufficientStatistics, joint_log_p #non-exported fns we need
+
+
+# use equality of likelihood ratios to test that the conditional distribution is consistent with the joint distribution
+@testset "LDA docs" begin
+  # generate some data from LDA where the document length is Poisson-distributed
+  k = 7
+  lexLength = 10
+  corpLambda = 10 # poisson parameter for random doc length
+  corpLength = 10
+  scaleK = 0.1
+  scaleL = 0.1
+  Random.seed!(123)
+
+  corpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
+
+  model = Model(corpus.alpha, corpus.beta, corpus)
+  state = State(model, corpus)
+  trainModel(model, state, 10) # update all the state variables
+
+  # pick a random doc/word to iterate the sampler
+  doc_ind = rand(1:corpLength)
+  word_ind = rand(1:length(corpus.docs[doc_ind]))
+  word = corpus.docs[doc_ind].terms[word_ind]
+
+  conditional = state.conditionals[doc_ind][word_ind,:]
+  oldTopic = copy(state.assignments[doc_ind][word_ind]) # the topic currently assigned to this word
+
+  newTopic = rand(collect(1:k)[1:end .!= oldTopic],1) # a different topic
+  newTopic = Int64(newTopic[1])
+
+  # get the original state probs
+  joint_Lw = copy(joint_log_p(model,state)) # log prob of the full joint under the original topic assignment
+  cond_Lw = log(state.conditionals[doc_ind][word_ind,oldTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=oldTopic|...)
+  cond_Lw_new = log(state.conditionals[doc_ind][word_ind,newTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=newTopic|...)
+
+  updateSufficientStatistics(word, oldTopic, doc_ind, -model.corpus.weights[doc_ind][word_ind], state) # remove counts for the old topic
+  updateSufficientStatistics(word, newTopic, doc_ind, model.corpus.weights[doc_ind][word_ind], state) # update stats for the new topic
+  joint_Lw_new = copy(joint_log_p(model,state)) # log prob of the full joint under the new topic assignment
+
+  print("joint_Lw: ", joint_Lw, "\n")
+  print("cond_Lw: ", cond_Lw, "\n")
+
+  print("joint_Lw_new: ", joint_Lw_new, "\n")
+  print("cond_Lw_new: ", cond_Lw_new, "\n")
+
+  print("joint_LR: ", joint_Lw_new-joint_Lw, "\n")
+  print("cond_LR: ", cond_Lw_new-cond_Lw, "\n")
+  print("old Topic: ", oldTopic, "\n")
+  print("new Topic: ", newTopic, "\n")
+
+  # why is this happening? normalizers should cancel for both ratios, so our propto sampling dist should work
+  @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
+end
From 16cc7903e9997ca5ec084987f346d0a6da82700b Mon Sep 17 00:00:00 2001
From: Matt Karikomi
Date: Sun, 8 Mar 2020 11:48:27 -0700
Subject: [PATCH 5/5] comments in gibbs tests

---
 test/Gibbs_unit_tests.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
index 3998c47..fda7937 100644
--- a/test/Gibbs_unit_tests.jl
+++ b/test/Gibbs_unit_tests.jl
@@ -50,6 +50,5 @@ using TopicModels: updateSufficientStatistics, joint_log_p #non-exported fns we
   print("old Topic: ", oldTopic, "\n")
   print("new Topic: ", newTopic, "\n")
 
-  # why is this happening? normalizers should cancel for both ratios, so our propto sampling dist should work
   @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
 end
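A note on the identity the Gibbs unit test relies on: for a discrete variable z, the conditional satisfies p(z = k | rest) ∝ p(z = k, rest), so a log-likelihood ratio between two values of z is the same whether it is computed from the conditional or from the full joint, because the normalizer cancels. A minimal self-contained sketch of that identity, using only toy probabilities and base Julia rather than the package's API:

```julia
# Toy check of the joint/conditional ratio identity used by the test above.
w = rand(4)                      # unnormalized joint probabilities of 4 states
logjoint = log.(w)               # log joint evaluated at each state
cond = w ./ sum(w)               # the normalized conditional distribution

k_old, k_new = 1, 3
joint_lr = logjoint[k_new] - logjoint[k_old]    # log-ratio from the joint
cond_lr = log(cond[k_new]) - log(cond[k_old])   # log-ratio from the conditional
@assert isapprox(joint_lr, cond_lr; atol=1e-10) # the normalizer cancels
```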