Skip to content

Commit 6005d1b

Browse files
committed
Added txt_count_words and removed Suggests dependency of udpipe
1 parent 093193a commit 6005d1b

11 files changed

Lines changed: 89 additions & 40 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ RoxygenNote: 7.1.1
1818
Depends: R (>= 2.10)
1919
Imports: Rcpp (>= 0.11.5), stats
2020
LinkingTo: Rcpp
21-
Suggests: tokenizers.bpe, udpipe (>= 0.8.4)
21+
Suggests: tokenizers.bpe

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ S3method(summary,paragraph2vec_trained)
99
export(paragraph2vec)
1010
export(paragraph2vec_similarity)
1111
export(read.paragraph2vec)
12+
export(txt_count_words)
1213
export(write.paragraph2vec)
1314
importFrom(Rcpp,evalCpp)
1415
importFrom(stats,predict)

NEWS.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## CHANGES IN doc2vec VERSION 0.1.1
2-
3-
- Fixes for valgrind R CMD checks
4-
- Fixes for destructors of Vocabulary
5-
- Remove WMD
6-
7-
## CHANGES IN doc2vec VERSION 0.1.0
8-
9-
- Initial package based on https://github.com/hiyijian/doc2vec commit dec123e891f17ea664053ee7575b0e5e7dae4fca
1+
## CHANGES IN doc2vec VERSION 0.1.1
2+
3+
- Fixes for valgrind R CMD checks
4+
- Fixes for destructors of Vocabulary
5+
- Remove WMD
6+
- Added txt_count_words and removed Suggests dependency of udpipe
7+
8+
## CHANGES IN doc2vec VERSION 0.1.0
9+
10+
- Initial package based on https://github.com/hiyijian/doc2vec commit dec123e891f17ea664053ee7575b0e5e7dae4fca

R/paragraph2vec.R

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@
3636
#' @seealso \code{\link{predict.paragraph2vec}}, \code{\link{as.matrix.paragraph2vec}}
3737
#' @export
3838
#' @examples
39-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
39+
#' \dontshow{if(require(tokenizers.bpe))\{}
4040
#' library(tokenizers.bpe)
41-
#' library(udpipe)
4241
#' ## Take data and standardise it a bit
4342
#' data(belgium_parliament, package = "tokenizers.bpe")
4443
#' str(belgium_parliament)
@@ -47,7 +46,7 @@
4746
#' x$text <- gsub("[^[:alpha:]]", " ", x$text)
4847
#' x$text <- gsub("[[:space:]]+", " ", x$text)
4948
#' x$text <- trimws(x$text)
50-
#' x$nwords <- txt_count(x$text, pattern = " ")
49+
#' x$nwords <- txt_count_words(x$text)
5150
#' x <- subset(x, nwords < 1000 & nchar(text) > 0)
5251
#'
5352
#' ## Build the model
@@ -121,12 +120,11 @@ paragraph2vec <- function(x,
121120
#' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}}
122121
#' @export
123122
#' @examples
124-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
123+
#' \dontshow{if(require(tokenizers.bpe))\{}
125124
#' library(tokenizers.bpe)
126-
#' library(udpipe)
127125
#' data(belgium_parliament, package = "tokenizers.bpe")
128126
#' x <- subset(belgium_parliament, language %in% "french")
129-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
127+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
130128
#'
131129
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5)
132130
#' \donttest{
@@ -162,12 +160,11 @@ as.matrix.paragraph2vec_trained <- function(x, encoding='UTF-8', ...){
162160
#' @export
163161
#' @seealso \code{\link{paragraph2vec}}
164162
#' @examples
165-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
163+
#' \dontshow{if(require(tokenizers.bpe))\{}
166164
#' library(tokenizers.bpe)
167-
#' library(udpipe)
168165
#' data(belgium_parliament, package = "tokenizers.bpe")
169166
#' x <- subset(belgium_parliament, language %in% "french")
170-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
167+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
171168
#'
172169
#' \donttest{
173170
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20)
@@ -209,12 +206,11 @@ write.paragraph2vec <- function(x, file){
209206
#' }
210207
#' @export
211208
#' @examples
212-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
209+
#' \dontshow{if(require(tokenizers.bpe))\{}
213210
#' library(tokenizers.bpe)
214-
#' library(udpipe)
215211
#' data(belgium_parliament, package = "tokenizers.bpe")
216212
#' x <- subset(belgium_parliament, language %in% "french")
217-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
213+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
218214
#'
219215
#' \donttest{
220216
#' model <- paragraph2vec(x = x, type = "PV-DM", dim = 100, iter = 20)
@@ -299,13 +295,12 @@ summary.paragraph2vec_trained <- function(object, type = "vocabulary", which = c
299295
#' @seealso \code{\link{paragraph2vec}}, \code{\link{read.paragraph2vec}}
300296
#' @export
301297
#' @examples
302-
#' \dontshow{if(require(tokenizers.bpe) & require(udpipe))\{}
298+
#' \dontshow{if(require(tokenizers.bpe))\{}
303299
#' library(tokenizers.bpe)
304-
#' library(udpipe)
305300
#' data(belgium_parliament, package = "tokenizers.bpe")
306301
#' x <- belgium_parliament
307302
#' x <- subset(x, language %in% "dutch")
308-
#' x <- subset(x, nchar(text) > 0 & txt_count(text, pattern = " ") < 1000)
303+
#' x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
309304
#' x$doc_id <- sprintf("doc_%s", 1:nrow(x))
310305
#' x$text <- tolower(x$text)
311306
#' x$text <- gsub("[^[:alpha:]]", " ", x$text)

R/utils.R

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#' @title Count the number of spaces or tabs occurring in text
#' @description The C++ doc2vec functionalities in this package assume words are either separated
#' by a space or tab symbol and that each document contains less than 1000 words.\cr
#' This function calculates how many words there are in each element of a character vector by counting
#' the number of occurrences of the space or tab symbol.
#' @param x a character vector with text
#' @param pattern a text pattern to count which might be contained in \code{x}. Defaults to either space or tab.
#' @param ... other arguments, passed on to \code{\link{gregexpr}}
#' @return an integer vector of the same length as \code{x} indicating how many times the pattern is occurring in \code{x}.
#' Elements of \code{x} which are \code{NA} yield \code{NA}.
#' @export
#' @examples
#' x <- c("Count me in.007", "this is a set of words",
#'        "more\texamples tabs-and-spaces.only", NA)
#' txt_count_words(x)
txt_count_words <- function(x, pattern = "[ \t]", ...){
  ## gregexpr gives, per element of x: a vector of match positions,
  ## -1 when there is no match, or NA when the input element is missing
  matches <- gregexpr(pattern = pattern, text = x, ...)
  ## vapply instead of sapply: guarantees an integer vector even when x has
  ## length zero (sapply would return list() there, breaking the contract)
  vapply(matches, FUN = function(hits){
    no_hit <- length(hits) == 1 && hits < 0
    if(is.na(no_hit)){
      ## input element was NA -> propagate missingness
      return(NA_integer_)
    }
    if(no_hit){
      0L
    }else{
      length(hits)
    }
  }, FUN.VALUE = integer(1), USE.NAMES = FALSE)
}

man/as.matrix.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/predict.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/read.paragraph2vec.Rd

Lines changed: 2 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/txt_count_words.Rd

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)