|
6 | 6 | #' where an additional vector for every paragraph is added directly in the training. |
7 | 7 | #' @param x a data.frame with columns doc_id and text or the path to the file on disk containing training data.\cr |
8 | 8 | #' Note that the text column should be of type character, should contain fewer than 1000 words where space or tab is |
9 | | -#' used as a word separator and that the text should not contain newline characters as these are considered document delimiters. |
| 9 | +#' used as a word separator and that the text should not contain newline characters as these are considered document delimiters.\cr |
| 10 | +#' The doc_id should not contain spaces or tabs. |
10 | 11 | #' @param type character string with the type of algorithm to use, either one of |
11 | 12 | #' \itemize{ |
12 | 13 | #' \item{'PV-DM': Distributed Memory paragraph vectors} |
@@ -99,6 +100,17 @@ paragraph2vec <- function(x, |
99 | 100 | file_train <- x |
100 | 101 | }else{ |
101 | 102 | stopifnot(is.data.frame(x) && all(c("doc_id", "text") %in% colnames(x))) |
| 103 | + nwords <- txt_count_words(x$text, pattern = "[ \t]") |
| 104 | + idx <- which(nwords >= 1000) |
| 105 | + if(length(idx) > 0){ |
| 106 | + message(sprintf("Note: there are texts which are longer than 1000 words, for these we will take only the first 1000 words, example doc_id: %s", x$doc_id[sample(idx, size = 1)])) |
| 107 | + x$text[idx] <- sapply(strsplit(x$text[idx], split = "[ \t]"), FUN = function(x) paste(head(x, n = 1000), collapse = " ")) |
| 108 | + } |
| 109 | + idx <- grepl(x$doc_id, pattern = "[ \t]") |
| 110 | + idx <- which(idx) |
| 111 | + if(length(idx) > 0){ |
| 112 | + warning(sprintf("There are doc_id's containing spaces, make sure your doc_id has no spaces otherwise the doc_id will be everything before the space and the remainder will be a word which is considered part of the document, e.g look at doc_id: %s", x$doc_id[sample(idx, size = 1)])) |
| 113 | + } |
102 | 114 | file_train <- tempfile(pattern = "textspace_", fileext = ".txt") |
103 | 115 | on.exit({ |
104 | 116 | if (file.exists(file_train)) file.remove(file_train) |
|
0 commit comments