From 496994673b4a915d86992be6780c1b7ce50b57ea Mon Sep 17 00:00:00 2001 From: Matt Karikomi Date: Sat, 29 Feb 2020 21:37:45 -0800 Subject: [PATCH] v1.3 compat v1.3 compat fixed lexicon --- .DS_Store | Bin 0 -> 6148 bytes Manifest.toml | 188 +++++++++++++++++++++++++++++++++++++++++++++ Project.toml | 18 +++++ REQUIRE | 0 src/TopicModels.jl | 34 ++++---- 5 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 .DS_Store create mode 100644 Manifest.toml create mode 100644 Project.toml delete mode 100644 REQUIRE diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8b6cf85eb3dc8b3f099f156b7f1f2012e201d7ab GIT binary patch literal 6148 zcmeHK%}T>S5Z-O8ZYyFAq8@Yc)n$)1NlDE(|@)3L; zXLdJYvEWU_&cNLUhY(d!?ONMe)~MS~ z)0x%ntknX&xqmR5RmIMpb9mCbdw9&AX3sC0T@K_c*)&+dD=4P8dv6krWONH7MRXBG zNDL4I!~ij{WDJ;tK&&rW-Bd0yKn(oM0PYVqG(^{6u2F3r(BbtN{S`zM(D5ySXd845 z<{H5R!gVU3PUYr_!F4)}+a}I6m}}JOjH{Jl9Pi2-6@o`JG9 zEj<5^;g?zX$e&Lkix?mV{uu+j((yWND9W6z-^#`+sePmZT dwckb^;#`BdMw|u5RXQMD1Qa3E5d*)#zz0PoPhS84 literal 0 HcmV?d00001 diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..c704ecc --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,188 @@ +# This file is machine-generated - editing it directly is not advised + +[[Arpack]] +deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] +git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" +uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +version = "0.4.0" + +[[Arpack_jll]] +deps = ["Libdl", "OpenBLAS_jll", "Pkg"] +git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" +uuid = "68821587-b530-5797-8361-c406ea357684" +version = "3.5.0+2" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.8" + +[[DataAPI]] +git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.1.0" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.10" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] +git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.22.4" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.8.5" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.3" + +[[OpenBLAS_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.7+5" + +[[OpenSpecFun_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+1" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.1.0" + +[[PDMats]] +deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] +git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.9.11" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.3.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Rmath]] +deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] +git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.6.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "e19b98acb182567bcb7b75bb5d9eedf3a3b5ec6c" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.10.0" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.32.1" + +[[StatsFuns]] +deps = ["Rmath", "SpecialFunctions"] +git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.4" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..65ba1d2 --- /dev/null +++ b/Project.toml @@ -0,0 +1,18 @@ +name = "TopicModels" +uuid = "e9825ca3-3499-4c9b-97dc-a93734876e50" +authors = ["Jonathan Chang "] +version = "0.1.0" + +[deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" + +[compat] +julia = "1.3" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/REQUIRE b/REQUIRE deleted file mode 100644 index e69de29..0000000 diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 38696f5..2990d26 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -2,9 +2,9 @@ module TopicModels import Base.length -typealias RaggedMatrix{T} Array{Array{T,1},1} +RaggedMatrix{T} = Array{Array{T,1},1} -type Corpus +struct Corpus documents::RaggedMatrix{Int64} weights::RaggedMatrix{Float64} @@ -15,7 +15,7 @@ type Corpus weights ) end - + Corpus(documents::RaggedMatrix{Int64}) = begin weights = map(documents) do doc ones(Float64, length(doc)) @@ -27,7 +27,7 @@ type Corpus end end -type Model +struct Model alphaPrior::Vector{Float64} betaPrior::Float64 topics::Array{Float64,2} @@ -37,9 +37,9 @@ type Model frozen::Bool corpus::Corpus - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, + Model(alphaPrior::Vector{Float64}, + betaPrior::Float64, + V::Int64, corpus::Corpus) = begin K = length(alphaPrior) m = new( @@ -48,7 +48,7 @@ type Model zeros(Float64, K, V), # topics zeros(Float64, K), # topicSums zeros(Float64, K, length(corpus.documents)), #documentSums - fill(Array(Int64, 0), length(corpus.documents)), # assignments + Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments false, corpus ) @@ -111,8 +111,8 @@ function wordDistribution(word::Int, out::Vector{Float64}) V = size(model.topics, 2) for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / + u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * + (model.topics[ii, word] + model.betaPrior) / (model.topicSums[ii] + V * model.betaPrior) @inbounds out[ii] = u end @@ -128,10 +128,10 @@ function sampleWord(word::Int, end -function updateSufficientStatistics(word::Int64, +function updateSufficientStatistics(word::Int64, topic::Int64, document::Int64, - scale::Float64, + scale::Float64, model::Model) fr = Float64(!model.frozen) @inbounds model.documentSums[topic, document] += scale @@ -146,7 +146,7 @@ function sampleDocument(document::Int, Nw = length(words) @inbounds weights = model.corpus.weights[document] K = length(model.alphaPrior) - p = Array(Float64, K) + p = Array{Float64,1}(undef,K) @inbounds assignments = model.assignments[document] for ii in 1:Nw @inbounds word = words[ii] @@ -170,10 +170,10 @@ end function termToWordSequence(term::AbstractString) parts = split(term, ":") fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end +end # The functions below are designed for public consumption -function trainModel(model::Model, +function trainModel(model::Model, numIterations::Int64) for ii in 1:numIterations println(string("Iteration ", ii, "...")) @@ -183,7 +183,7 @@ function trainModel(model::Model, end function topTopicWords(model::Model, - lexicon::Array{ASCIIString,1}, + lexicon::Array{String,1}, numWords::Int64) [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] for row in 1:size(model.topics,1)] @@ -198,7 +198,7 @@ end function readLexicon(stream) lines = readlines(stream) - map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines))) end export Corpus,