|
88 | 88 | #' ## with unrealistic hyperparameter settings especially regarding dim / iter / n_epochs |
89 | 89 | #' ## in order to have a basic example finishing < 5 secs |
90 | 90 | #' ## |
| 91 | +#' \dontshow{if(require(word2vec) && require(uwot) && require(dbscan))\{} |
91 | 92 | #' library(uwot) |
92 | 93 | #' library(dbscan) |
| 94 | +#' library(word2vec) |
93 | 95 | #' data(be_parliament_2020, package = "doc2vec") |
94 | 96 | #' x <- data.frame(doc_id = be_parliament_2020$doc_id, |
95 | 97 | #' text = be_parliament_2020$text_nl, |
96 | 98 | #' stringsAsFactors = FALSE) |
97 | | -#' x <- head(x, 500) |
| 99 | +#' x <- head(x, 1000) |
| 100 | +#' x$text <- txt_clean_word2vec(x$text) |
| 101 | +#' x <- subset(x, txt_count_words(text) < 1000) |
98 | 102 | #' d2v <- paragraph2vec(x, type = "PV-DBOW", dim = 10, |
99 | 103 | #' lr = 0.05, iter = 0, |
100 | 104 | #' window = 5, hs = TRUE, negative = 0, |
|
104 | 108 | #' model <- top2vec(emb, |
105 | 109 | #' data = x, |
106 | 110 | #' control.dbscan = list(minPts = 50), |
107 | | -#' control.umap = list(n_neighbors = 5, n_components = 2, |
108 | | -#' n_epochs = 0, init = "spectral"), |
| 111 | +#' control.umap = list(n_neighbors = 15, n_components = 2, |
| 112 | +#' init = "spectral"), |
109 | 113 | #' umap = tumap, trace = TRUE) |
110 | 114 | #' info <- summary(model, top_n = 7) |
111 | 115 | #' print(info, top_n = c(5, 2)) |
| 116 | +#' \dontshow{\} # End of main if statement running only if the required packages are installed} |
112 | 117 | top2vec <- function(x, |
113 | 118 | data = data.frame(doc_id = character(), text = character(), stringsAsFactors = FALSE), |
114 | 119 | control.umap = list(n_neighbors = 15L, n_components = 5L, metric = "cosine"), |
@@ -173,6 +178,13 @@ top2vec <- function(x, |
173 | 178 | out |
174 | 179 | } |
175 | 180 |
|
# S3 print method for objects of class top2vec.
# Writes a three-line summary to the console: the number of rows in
# x$embedding$docs, the value of x$k and the proportions derived from x$size.
# Arguments:
#   x    the object to print; reads x$embedding$docs, x$k and x$size
#   ...  accepted for compatibility with the print generic, not used
# Returns x invisibly.
#' @export
print.top2vec <- function(x, ...) {
  # Share of each entry of x$size in the total, rounded to 2 decimals
  topic_share <- round(prop.table(x$size), 2)
  # writeLines terminates every element with a newline, whereas
  # cat(<one string>, sep = "\n") only puts the separator between elements
  # and therefore leaves the three summary lines glued together
  writeLines(c(
    sprintf("Top2vec model trained on %s documents", nrow(x$embedding$docs)),
    sprintf(" number of topics: %s", x$k),
    sprintf(" topic distribution: %s", paste(topic_share, collapse = " "))
  ))
  # print methods hand back their argument invisibly so the call can be chained
  invisible(x)
}
| 187 | + |
176 | 188 |
|
177 | 189 | #' @title Update a Top2vec model |
178 | 190 | #' @description Update a Top2vec model by updating the UMAP dimension reduction together with the HDBSCAN clustering |
|
0 commit comments