Skip to content

Commit 6005d1b

Browse files
committed
Added txt_count_words and removed Suggests dependency of udpipe
1 parent 093193a commit 6005d1b

11 files changed

Lines changed: 89 additions & 40 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ RoxygenNote: 7.1.1
1818
Depends: R (>= 2.10)
1919
Imports: Rcpp (>= 0.11.5), stats
2020
LinkingTo: Rcpp
21-
Suggests: tokenizers.bpe, udpipe (>= 0.8.4)
21+
Suggests: tokenizers.bpe

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ S3method(summary,paragraph2vec_trained)
99
export(paragraph2vec)
1010
export(paragraph2vec_similarity)
1111
export(read.paragraph2vec)
12+
export(txt_count_words)
1213
export(write.paragraph2vec)
1314
importFrom(Rcpp,evalCpp)
1415
importFrom(stats,predict)

NEWS.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## CHANGES IN doc2vec VERSION 0.1.1
2-
3-
- Fixes for valgrind R CMD checks
4-
- Fixes for destructors of Vocabulary
5-
- Remove WMD
6-
7-
## CHANGES IN doc2vec VERSION 0.1.0
8-
9-
- Initial package based on https://github.com/hiyijian/doc2vec commit dec123e891f17ea664053ee7575b0e5e7dae4fca
1+
## CHANGES IN doc2vec VERSION 0.1.1
2+
3+
- Fixes for valgrind R CMD checks
4+
- Fixes for destructors of Vocabulary
5+
- Remove WMD
6+
- Added txt_count_words and removed Suggests dependency of udpipe
7+
8+
## CHANGES IN doc2vec VERSION 0.1.0
9+
10+
- Initial package based on https://github.com/hiyijian/doc2vec commit dec123e891f17ea664053ee7575b0e5e7dae4fca

R/paragraph2vec.R

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@
3636
#' @seealso \code{\link{predict.paragraph2vec}}, \code{\link{as.matrix.paragraph2vec}}
3737
#' @export
3838
#' @examples
39-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
39+
#' \dontshow{if(require(tokenizers.bpe))\{}
4040
#' library(tokenizers.bpe)
41-
#' library(udpipe)
4241
#' ## Take data and standardise it a bit
4342
#' data(belgium_parliament, package = "tokenizers.bpe")
4443
#' str(belgium_parliament)
@@ -47,7 +46,7 @@
4746
#' x$text <- gsub("[^[:alpha:]]", " ", x$text)
4847
#' x$text <- gsub("[[:space:]]+", " ", x$text)
4948
#' x$text <- trimws(x$text)
50-
#' x$nwords <- txt_count(x$text, pattern = " ")
49+
#' x$nwords <- txt_count_words(x$text)
5150
#' x <- subset(x, nwords < 1000 & nchar(text) > 0)
5251
#'
5352
#' ## Build the model
@@ -121,12 +120,11 @@ paragraph2vec <- function(x,
121120
#' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}}
122121
#' @export
123122
#' @examples
124-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
123+
#' \dontshow{if(require(tokenizers.bpe))\{}
125124
#' library(tokenizers.bpe)
126-
#' library(udpipe)
127125
#' data(belgium_parliament, package = "tokenizers.bpe")
128126
#' x <- subset(belgium_parliament, language %in% "french")
129-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
127+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
130128
#'
131129
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5)
132130
#' \donttest{
@@ -162,12 +160,11 @@ as.matrix.paragraph2vec_trained <- function(x, encoding='UTF-8', ...){
162160
#' @export
163161
#' @seealso \code{\link{paragraph2vec}}
164162
#' @examples
165-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
163+
#' \dontshow{if(require(tokenizers.bpe))\{}
166164
#' library(tokenizers.bpe)
167-
#' library(udpipe)
168165
#' data(belgium_parliament, package = "tokenizers.bpe")
169166
#' x <- subset(belgium_parliament, language %in% "french")
170-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
167+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
171168
#'
172169
#' \donttest{
173170
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20)
@@ -209,12 +206,11 @@ write.paragraph2vec <- function(x, file){
209206
#' }
210207
#' @export
211208
#' @examples
212-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
209+
#' \dontshow{if(require(tokenizers.bpe))\{}
213210
#' library(tokenizers.bpe)
214-
#' library(udpipe)
215211
#' data(belgium_parliament, package = "tokenizers.bpe")
216212
#' x <- subset(belgium_parliament, language %in% "french")
217-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
213+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
218214
#'
219215
#' \donttest{
220216
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20)
@@ -299,13 +295,12 @@ summary.paragraph2vec_trained <- function(object, type = "vocabulary", which = c
299295
#' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}}
300296
#' @export
301297
#' @examples
302-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
298+
#' \dontshow{if(require(tokenizers.bpe))\{}
303299
#' library(tokenizers.bpe)
304-
#' library(udpipe)
305300
#' data(belgium_parliament, package = "tokenizers.bpe")
306301
#' x <- belgium_parliament
307302
#' x <- subset(x, language %in% "dutch")
308-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
303+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
309304
#' x$doc_id <- sprintf("doc_%s", 1:nrow(x))
310305
#' x$text <- tolower(x$text)
311306
#' x$text <- gsub("[^[:alpha:]]", " ", x$text)

R/utils.R

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#' @title Count the number of spaces or tabs occurring in text
#' @description The C++ doc2vec functionalities in this package assume words are either separated
#' by a space or tab symbol and that each document contains less than 1000 words.\cr
#' This function calculates how many words there are in each element of a character vector by counting
#' the number of occurrences of the space or tab symbol.
#' @param x a character vector with text
#' @param pattern a text pattern to count which might be contained in \code{x}. Defaults to either space or tab.
#' @param ... other arguments, passed on to \code{\link{gregexpr}}
#' @return an integer vector of the same length as \code{x} indicating how many times the pattern is occurring in \code{x}.
#' Elements of \code{x} which are \code{NA} yield \code{NA}.
#' @export
#' @examples
#' x <- c("Count me in.007", "this is a set of words",
#'        "more\texamples tabs-and-spaces.only", NA)
#' txt_count_words(x)
txt_count_words <- function(x, pattern = "[ \t]", ...){
  ## gregexpr gives, per element of x: a vector of match positions,
  ## -1 when there is no match, or NA when the input element is missing
  matches <- gregexpr(pattern = pattern, text = x, ...)
  ## vapply instead of sapply: guarantees an integer vector even when x has
  ## length zero (sapply would return list() there, breaking the contract)
  vapply(matches, FUN = function(hits){
    no_hit <- length(hits) == 1 && hits < 0
    if(is.na(no_hit)){
      ## input element was NA -> propagate missingness
      return(NA_integer_)
    }
    if(no_hit){
      0L
    }else{
      length(hits)
    }
  }, FUN.VALUE = integer(1), USE.NAMES = FALSE)
}

man/as.matrix.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/predict.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/read.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/txt_count_words.Rd

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)