|
36 | 36 | #' @seealso \code{\link{predict.paragraph2vec}}, \code{\link{as.matrix.paragraph2vec}} |
37 | 37 | #' @export |
38 | 38 | #' @examples |
39 | | -#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{} |
| 39 | +#' \dontshow{if(require(tokenizers.bpe))\{} |
40 | 40 | #' library(tokenizers.bpe) |
41 | | -#' library(udpipe) |
42 | 41 | #' ## Take data and standardise it a bit |
43 | 42 | #' data(belgium_parliament, package = "tokenizers.bpe") |
44 | 43 | #' str(belgium_parliament) |
|
47 | 46 | #' x$text <- gsub("[^[:alpha:]]", " ", x$text) |
48 | 47 | #' x$text <- gsub("[[:space:]]+", " ", x$text) |
49 | 48 | #' x$text <- trimws(x$text) |
50 | | -#' x$nwords <- txt_count(x$text, pattern = " ") |
| 49 | +#' x$nwords <- txt_count_words(x$text) |
51 | 50 | #' x <- subset(x, nwords < 1000 & nchar(text) > 0) |
52 | 51 | #' |
53 | 52 | #' ## Build the model |
@@ -121,12 +120,11 @@ paragraph2vec <- function(x, |
121 | 120 | #' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}} |
122 | 121 | #' @export |
123 | 122 | #' @examples |
124 | | -#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{} |
| 123 | +#' \dontshow{if(require(tokenizers.bpe))\{} |
125 | 124 | #' library(tokenizers.bpe) |
126 | | -#' library(udpipe) |
127 | 125 | #' data(belgium_parliament, package = "tokenizers.bpe") |
128 | 126 | #' x <- subset(belgium_parliament, language %in% "french") |
129 | | -#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000) |
| 127 | +#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000) |
130 | 128 | #' |
131 | 129 | #' model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5) |
132 | 130 | #' \donttest{ |
@@ -162,12 +160,11 @@ as.matrix.paragraph2vec_trained <- function(x, encoding='UTF-8', ...){ |
162 | 160 | #' @export |
163 | 161 | #' @seealso \code{\link{paragraph2vec}} |
164 | 162 | #' @examples |
165 | | -#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{} |
| 163 | +#' \dontshow{if(require(tokenizers.bpe))\{} |
166 | 164 | #' library(tokenizers.bpe) |
167 | | -#' library(udpipe) |
168 | 165 | #' data(belgium_parliament, package = "tokenizers.bpe") |
169 | 166 | #' x <- subset(belgium_parliament, language %in% "french") |
170 | | -#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000) |
| 167 | +#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000) |
171 | 168 | #' |
172 | 169 | #' \donttest{ |
173 | 170 | #' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20) |
@@ -209,12 +206,11 @@ write.paragraph2vec <- function(x, file){ |
209 | 206 | #' } |
210 | 207 | #' @export |
211 | 208 | #' @examples |
212 | | -#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{} |
| 209 | +#' \dontshow{if(require(tokenizers.bpe))\{} |
213 | 210 | #' library(tokenizers.bpe) |
214 | | -#' library(udpipe) |
215 | 211 | #' data(belgium_parliament, package = "tokenizers.bpe") |
216 | 212 | #' x <- subset(belgium_parliament, language %in% "french") |
217 | | -#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000) |
| 213 | +#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000) |
218 | 214 | #' |
219 | 215 | #' \donttest{ |
220 | 216 | #' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20) |
@@ -299,13 +295,12 @@ summary.paragraph2vec_trained <- function(object, type = "vocabulary", which = c |
299 | 295 | #' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}} |
300 | 296 | #' @export |
301 | 297 | #' @examples |
302 | | -#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{} |
| 298 | +#' \dontshow{if(require(tokenizers.bpe))\{} |
303 | 299 | #' library(tokenizers.bpe) |
304 | | -#' library(udpipe) |
305 | 300 | #' data(belgium_parliament, package = "tokenizers.bpe") |
306 | 301 | #' x <- belgium_parliament |
307 | 302 | #' x <- subset(x, language %in% "dutch") |
308 | | -#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000) |
| 303 | +#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000) |
309 | 304 | #' x$doc_id <- sprintf("doc_%s", 1:nrow(x)) |
310 | 305 | #' x$text <- tolower(x$text) |
311 | 306 | #' x$text <- gsub("[^[:alpha:]]", " ", x$text) |
|
0 commit comments