## -----------------------------------------------------------------------------
library(dplyr)
library(tidyr)
library(tibble)
library(purrr)
library(stringr)
library(ggplot2)
library(lpSolve) # For matching fitted and trust topics
library(tictoc) # For checking timing

library(tmfast)
library(stm) # Used for comparison
library(tidytext) # Used for stm tidiers

## -----------------------------------------------------------------------------
k = 10 # Num. topics / journals
Mj = 100 # Num. documents per journal
M = Mj * k # Total corpus size
vocab = M # Vocabulary length

## Negative binomial distribution of doc lengths
size = 10 # Size and mean
mu = 300
sqrt(mu + mu^2 / size) # Resulting SD of document sizes

## Dirichlet distributions for topic-docs and word-topics
topic_peak = .8
topic_scale = 10

word_beta = 0.1

## -----------------------------------------------------------------------------
set.seed(2022 - 06 - 19)

## -----------------------------------------------------------------------------
## Journal-specific alpha, with a peak value (.8 by default) and uniform otherwise
theta = map(
      1:k,
      ~ rdirichlet(
            Mj,
            peak_alpha(k, .x, peak = topic_peak, scale = topic_scale)
      )
) |>
      reduce(rbind)

theta_df = theta |>
      as_tibble(rownames = 'doc', .name_repair = tmfast:::make_colnames) |>
      mutate(doc = as.integer(doc)) |>
      pivot_longer(starts_with('V'), names_to = 'topic', values_to = 'prob')

ggplot(theta_df, aes(doc, topic, fill = prob)) +
      geom_tile()

## -----------------------------------------------------------------------------
## phi_j:  Word distribution for topic j
phi = rdirichlet(k, word_beta, k = vocab)

## Word distributions
phi |>
      as_tibble(rownames = 'topic', .name_repair = tmfast:::make_colnames) |>
      pivot_longer(starts_with('V'), names_to = 'word', values_to = 'prob') |>
      ggplot(aes(topic, word, fill = (prob))) +
      geom_tile() +
      scale_y_discrete(breaks = NULL)

## Zipf's law
phi |>
      as_tibble(rownames = 'topic', .name_repair = \(x) {
            (str_c('word', 1:vocab))
      }) |>
      pivot_longer(
            starts_with('word'),
            names_to = 'word',
            values_to = 'prob'
      ) |>
      group_by(topic) |>
      mutate(rank = rank(desc(prob))) |>
      arrange(topic, rank) |>
      filter(rank < vocab / 2) |>
      ggplot(aes(rank, prob, color = topic)) +
      geom_line() +
      scale_x_log10() +
      scale_y_log10()

## -----------------------------------------------------------------------------
## N_i:  Length of document i
N = rnbinom(M, size = size, mu = mu)
summary(N)
sd(N)
hist(N)

## ----cache = TRUE-------------------------------------------------------------
tic()
corpus = draw_corpus(N, theta, phi)
toc()
dtm = mutate(corpus, n = log1p(n))

## -----------------------------------------------------------------------------
tic()
fitted = tmfast(dtm, c(2, 3, k, 2 * k))
toc()

## -----------------------------------------------------------------------------
str(fitted, max.level = 2L)
str(fitted$varimax$`10`)

## -----------------------------------------------------------------------------
screeplot(fitted)

## -----------------------------------------------------------------------------
## Variance coverage?
cumsum(fitted$sdev^2) / fitted$totalvar

data.frame(
      PC = 1:length(fitted$sdev),
      cum_var = cumsum(fitted$sdev^2) / fitted$totalvar
) |>
      ggplot(aes(PC, cum_var)) +
      geom_line() +
      geom_point()

## ----eval = FALSE-------------------------------------------------------------
# tic()
# corpus |>
#       cast_sparse(doc, word, n) |>
#       stm(K = 0, verbose = FALSE)
# toc()

## -----------------------------------------------------------------------------
tic()
fitted_stm = corpus |>
      cast_sparse(doc, word, n) |>
      stm(K = k, verbose = FALSE)
toc()

## -----------------------------------------------------------------------------
## beta: fitted varimax loadings, transformed to probability distributions
beta = tidy(fitted, k, 'beta')

## -----------------------------------------------------------------------------
## For convenience, arrange phi in a tidy-like format
phi_tidy = phi |>
      t() |>
      as_tibble(
            rownames = 'token',
            .name_repair = tmfast:::make_colnames
      ) |>
      pivot_longer(
            starts_with('V'),
            names_to = 'topic',
            values_to = 'beta'
      ) |>
      mutate(beta_rn = beta) |>
      mutate(type = 'true')

## Compare Zipfian distributions
bind_rows(
      mutate(beta, type = 'fitted'),
      phi_tidy
) |>
      group_by(type, topic) |>
      mutate(rank = rank(desc(beta))) |>
      arrange(type, topic, rank) |>
      filter(rank < vocab / 2) |>
      ggplot(aes(rank, beta, color = type, group = interaction(topic, type))) +
      geom_line() +
      scale_y_log10() +
      scale_x_log10()

## -----------------------------------------------------------------------------
phi_tidy |>
      group_by(topic) |>
      summarize(entropy = entropy(beta))

beta |>
      group_by(topic) |>
      summarize(entropy = entropy(beta))

## -----------------------------------------------------------------------------
expected_entropy(word_beta, k = vocab)

## -----------------------------------------------------------------------------
beta_power = target_power(
      tidy_df = beta,
      group_col = topic,
      p_col = beta,
      target_entropy = expected_entropy(word_beta, k = vocab)
)
beta_power

## -----------------------------------------------------------------------------
## Renormalized beta
beta_rn = tidy(fitted, k, 'beta', exponent = beta_power)

## Entropies after renormalization
beta_rn |>
      group_by(topic) |>
      summarize(entropy = entropy(beta))

## Compare Zipfian distributions
bind_rows(
      mutate(beta_rn, type = 'fitted'),
      phi_tidy
) |>
      group_by(type, topic) |>
      mutate(rank = rank(desc(beta))) |>
      arrange(type, topic, rank) |>
      filter(rank < vocab / 2) |>
      ggplot(aes(rank, beta, color = type, group = interaction(topic, type))) +
      geom_line() +
      scale_y_log10() +
      scale_x_log10()

## -----------------------------------------------------------------------------
## Hellinger distance of word-topic distributions
beta_mx = beta_rn |>
      ## Fix order of words
      mutate(token = as.integer(token)) |>
      arrange(token) |>
      ## And dropped words
      complete(token = 1:vocab, topic, fill = list(beta = 0)) |>
      pivot_wider(
            names_from = 'topic',
            values_from = 'beta',
            values_fill = 0,
            names_sort = TRUE
      ) |>
      # select(-`NA`) |>
      ## Coerce to matrix
      column_to_rownames('token') |>
      as.matrix()

hellinger(phi, t(beta_mx))

## -----------------------------------------------------------------------------
## Use lpSolve to match fitted topics to true topics
dist = hellinger(phi, t(beta_mx))
soln = lp.assign(dist)
soln$solution

hellinger(phi, soln$solution %*% t(beta_mx))
hellinger(phi, soln$solution %*% t(beta_mx)) |>
      diag() |>
      summary()

## -----------------------------------------------------------------------------
beta_stm_mx = tidy(fitted_stm, matrix = 'beta') |>
      ## Fix order of words
      mutate(term = as.integer(term)) |>
      arrange(term) |>
      ## And dropped words
      complete(term = 1:vocab, topic, fill = list(beta = 0)) |>
      pivot_wider(
            names_from = 'topic',
            values_from = 'beta',
            values_fill = 0,
            names_sort = TRUE
      ) |>
      # select(-`NA`) |>
      ## Coerce to matrix
      column_to_rownames('term') |>
      as.matrix()

hellinger(phi, t(beta_stm_mx))

rotation_stm = hellinger(phi, t(beta_stm_mx)) |>
      lp.assign() |>
      magrittr::extract2('solution')

hellinger(phi, rotation_stm %*% t(beta_stm_mx)) |>
      diag() |>
      summary()

## -----------------------------------------------------------------------------
## Example Dirichlet distribution used in drawing the true topic-document distributions
peak_alpha(k, 1, topic_peak, topic_scale)
expected_entropy(peak_alpha(k, 1, topic_peak, topic_scale))

## Renormalization exponent
gamma_power = tidy(fitted, k, 'gamma') |>
      target_power(
            document,
            gamma,
            expected_entropy(peak_alpha(k, 1, topic_peak, topic_scale))
      )
gamma_power

gamma_df = tidy(
      fitted,
      k,
      'gamma',
      rotation = soln$solution,
      exponent = gamma_power
)

gamma_df

## -----------------------------------------------------------------------------
gamma_df |>
      mutate(document = as.integer(document)) |>
      ggplot(aes(document, topic, fill = gamma)) +
      geom_raster() +
      scale_x_continuous(breaks = NULL)

gamma_df |>
      mutate(
            document = as.integer(document),
            journal = (document - 1) %/% Mj + 1
      ) |>
      ggplot(aes(topic, gamma, group = document, color = as.factor(journal))) +
      geom_line(alpha = .25) +
      facet_wrap(vars(journal), scales = 'free_x') +
      scale_color_discrete(guide = 'none')

## -----------------------------------------------------------------------------
hellinger(
      theta_df,
      id1 = doc,
      topics2 = gamma_df,
      id2 = document,
      prob2 = gamma,
      df = FALSE
) |>
      diag() |>
      summary()

## -----------------------------------------------------------------------------
fitted_stm_gamma = tidy(fitted_stm, matrix = 'gamma') |>
      pivot_wider(names_from = 'topic', values_from = 'gamma') |>
      column_to_rownames('document') |>
      as.matrix()

hellinger(theta, fitted_stm_gamma %*% t(rotation_stm)) |>
      diag() |>
      summary()

## -----------------------------------------------------------------------------
tsne(fitted, k) |>
      mutate(journal = (as.integer(document) - 1) %/% Mj + 1) |>
      ggplot(aes(x, y, color = as.character(journal))) +
      geom_point() +
      labs(title = 't-SNE visualization')

umap(fitted, k, df = TRUE) |>
      mutate(journal = (as.integer(document) - 1) %/% Mj + 1) |>
      ggplot(aes(x, y, color = as.character(journal))) +
      geom_point() +
      labs(title = 'UMAP visualization')