Skip to content

Commit 8b2c84b

Browse files
committed
fix arrays
1 parent c0d3ac4 commit 8b2c84b

3 files changed

Lines changed: 51 additions & 52 deletions

File tree

README.md

Lines changed: 47 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ This repository contains an R package allowing to build `Paragraph Vector` model
77
- The package allows one
88
- to train paragraph embeddings (also known as document embeddings) on character data or data in a text file
99
- use the embeddings to find similar documents, paragraphs, sentences or words
10-
- Note. For getting word vectors in R: look at package https://github.com/bnosac/word2vec, details [here](https://www.bnosac.be/index.php/blog/100-word2vec-in-r), for Starspace embeddings: look at package https://github.com/bnosac/ruimtehol, details [here](https://cran.r-project.org/web/packages/ruimtehol/vignettes/ground-control-to-ruimtehol.pdf)
10+
- Note. For getting word vectors in R: look at package https://github.com/bnosac/word2vec, details [here](https://www.bnosac.be/index.php/blog/100-word2vec-in-r), for Starspace embeddings: look at package https://github.com/bnosac/ruimtehol, details [here](https://CRAN.R-project.org/package=ruimtehol/vignettes/ground-control-to-ruimtehol.pdf)
1111

1212
## Installation
1313

@@ -61,7 +61,7 @@ str(model)
6161
## List of 3
6262
## $ model :<externalptr>
6363
## $ data :List of 4
64-
## ..$ file : chr "C:\\Users\\Jan\\AppData\\Local\\Temp\\Rtmpk9Npjg\\textspace_1c4432666686.txt"
64+
## ..$ file : chr "C:\\Users\\Jan\\AppData\\Local\\Temp\\Rtmpk9Npjg\\textspace_1c446bffa0e.txt"
6565
## ..$ n : num 170469
6666
## ..$ n_vocabulary: num 3867
6767
## ..$ n_docs : num 1000
@@ -117,10 +117,10 @@ embedding[, 1:4]
117117
```
118118

119119
```
120-
## [,1] [,2] [,3] [,4]
121-
## doc_1 0.038523957 -0.14341952 -0.06087392 -0.01625664
122-
## doc_10 0.003298676 -0.04789201 0.06048679 -0.14829759
123-
## doc_3 0.030986091 0.08946659 0.02453904 -0.01900235
120+
## [,1] [,2] [,3] [,4]
121+
## doc_1 0.05721277 -0.10298843 0.1089350 -0.03075439
122+
## doc_10 0.09553983 0.05211980 -0.0513489 -0.11847925
123+
## doc_3 0.08008177 -0.03324692 0.1563442 0.06585038
124124
```
125125

126126
- Get similar documents or words when providing sentences, documents or words
@@ -134,19 +134,19 @@ nn
134134
```
135135
## [[1]]
136136
## term1 term2 similarity rank
137-
## 1 proximus telefoontoestellen 0.5364115 1
138-
## 2 proximus belfius 0.5292925 2
139-
## 3 proximus internetverbinding 0.5140554 3
140-
## 4 proximus ceo 0.4961080 4
141-
## 5 proximus fusie 0.4803250 5
137+
## 1 proximus telefoontoestellen 0.5357178 1
138+
## 2 proximus belfius 0.5169221 2
139+
## 3 proximus ceo 0.4839031 3
140+
## 4 proximus klanten 0.4819543 4
141+
## 5 proximus taal 0.4590944 5
142142
##
143143
## [[2]]
144-
## term1 term2 similarity rank
145-
## 1 koning ministerie 0.5567209 1
146-
## 2 koning verplaatsingen 0.5317563 2
147-
## 3 koning grondwet 0.5118545 3
148-
## 4 koning gedragen 0.4884593 4
149-
## 5 koning verantwoordelijk 0.4788159 5
144+
## term1 term2 similarity rank
145+
## 1 koning ministerie 0.5615162 1
146+
## 2 koning verplaatsingen 0.5484987 2
147+
## 3 koning familie 0.4911003 3
148+
## 4 koning grondwet 0.4871097 4
149+
## 5 koning gedragen 0.4694150 5
150150
```
151151

152152
```r
@@ -157,19 +157,19 @@ nn
157157
```
158158
## [[1]]
159159
## term1 term2 similarity rank
160-
## 1 proximus doc_105 0.7080573 1
161-
## 2 proximus doc_863 0.6275553 2
162-
## 3 proximus doc_186 0.5301130 3
163-
## 4 proximus doc_862 0.4656175 4
164-
## 5 proximus doc_620 0.4396312 5
160+
## 1 proximus doc_105 0.6684639 1
161+
## 2 proximus doc_863 0.5917463 2
162+
## 3 proximus doc_186 0.5233522 3
163+
## 4 proximus doc_620 0.4919243 4
164+
## 5 proximus doc_862 0.4619178 5
165165
##
166166
## [[2]]
167167
## term1 term2 similarity rank
168-
## 1 koning doc_44 0.6395732 1
169-
## 2 koning doc_583 0.5574296 2
170-
## 3 koning doc_45 0.5361990 3
171-
## 4 koning doc_943 0.4225507 4
172-
## 5 koning doc_797 0.4086391 5
168+
## 1 koning doc_44 0.6686417 1
169+
## 2 koning doc_45 0.5616031 2
170+
## 3 koning doc_583 0.5379452 3
171+
## 4 koning doc_943 0.4855201 4
172+
## 5 koning doc_797 0.4573555 5
173173
```
174174

175175
```r
@@ -180,19 +180,19 @@ nn
180180
```
181181
## [[1]]
182182
## term1 term2 similarity rank
183-
## 1 doc_198 doc_343 0.4947847 1
184-
## 2 doc_198 doc_899 0.4893836 2
185-
## 3 doc_198 doc_923 0.4850165 3
186-
## 4 doc_198 doc_708 0.4697377 4
187-
## 5 doc_198 doc_642 0.4622465 5
183+
## 1 doc_198 doc_343 0.5522854 1
184+
## 2 doc_198 doc_899 0.4902798 2
185+
## 3 doc_198 doc_983 0.4847047 3
186+
## 4 doc_198 doc_642 0.4829021 4
187+
## 5 doc_198 doc_336 0.4674844 5
188188
##
189189
## [[2]]
190190
## term1 term2 similarity rank
191-
## 1 doc_285 doc_286 0.5537772 1
192-
## 2 doc_285 doc_319 0.5478524 2
193-
## 3 doc_285 doc_874 0.5095125 3
194-
## 4 doc_285 doc_113 0.4878533 4
195-
## 5 doc_285 doc_76 0.4863345 5
191+
## 1 doc_285 doc_319 0.5318567 1
192+
## 2 doc_285 doc_286 0.5100293 2
193+
## 3 doc_285 doc_113 0.5056069 3
194+
## 4 doc_285 doc_526 0.4840761 4
195+
## 5 doc_285 doc_488 0.4805686 5
196196
```
197197

198198
```r
@@ -206,19 +206,19 @@ nn
206206
```
207207
## $sent1
208208
## term1 term2 similarity rank
209-
## 1 sent1 doc_742 0.4385398 1
210-
## 2 sent1 doc_776 0.4269895 2
211-
## 3 sent1 doc_740 0.4247892 3
212-
## 4 sent1 doc_206 0.4162723 4
213-
## 5 sent1 doc_509 0.4153925 5
209+
## 1 sent1 doc_742 0.4830917 1
210+
## 2 sent1 doc_151 0.4340138 2
211+
## 3 sent1 doc_825 0.4263285 3
212+
## 4 sent1 doc_740 0.4059283 4
213+
## 5 sent1 doc_776 0.4024554 5
214214
##
215215
## $sent2
216216
## term1 term2 similarity rank
217-
## 1 sent2 doc_105 0.5738307 1
218-
## 2 sent2 doc_863 0.5229421 2
219-
## 3 sent2 doc_862 0.4981593 3
220-
## 4 sent2 doc_186 0.4873295 4
221-
## 5 sent2 doc_18 0.4671208 5
217+
## 1 sent2 doc_105 0.5497447 1
218+
## 2 sent2 doc_863 0.5061581 2
219+
## 3 sent2 doc_862 0.4973840 3
220+
## 4 sent2 doc_620 0.4793786 4
221+
## 5 sent2 doc_186 0.4755909 5
222222
```
223223

224224
```r

src/doc2vec/common_define.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@
1313
#define MAX_EXP 6
1414
#define MAX_SENTENCE_LENGTH 1000
1515
#define MAX_CODE_LENGTH 40
16-
#define MAX_DOC2VEC_KNN_R 100
1716
#define MAX_DOC2VEC_KNN 2000
1817
const int vocab_hash_size = 30000000;
1918
const int negtive_sample_table_size = 1e8;

src/rcpp_doc2vec.cpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ std::vector<std::string> paragraph2vec_dictionary(SEXP ptr, std::string type = "
9494
// [[Rcpp::export]]
9595
Rcpp::DataFrame paragraph2vec_nearest(SEXP ptr, std::string x, int top_n = 10, std::string type = "doc2doc") {
9696
Rcpp::XPtr<Doc2Vec> model(ptr);
97-
knn_item_t knn_items[MAX_DOC2VEC_KNN_R];
97+
knn_item_t knn_items[100];
9898
if(type == "doc2doc"){
9999
model->doc_knn_docs(x.c_str(), knn_items, top_n);
100100
}else if(type == "word2doc"){
@@ -114,7 +114,7 @@ Rcpp::DataFrame paragraph2vec_nearest(SEXP ptr, std::string x, int top_n = 10, s
114114
distance.push_back(kv.similarity);
115115
r = r + 1;
116116
rank.push_back(r);
117-
if(r >= top_n || r >= MAX_DOC2VEC_KNN_R) {
117+
if(r >= top_n || r >= 100) {
118118
break;
119119
}
120120
}
@@ -149,7 +149,7 @@ Rcpp::List paragraph2vec_nearest_sentence(SEXP ptr, Rcpp::List x, int top_n = 10
149149
}
150150
model->infer_doc(&doc, infer_vector);
151151
// Get closest docs to sentence
152-
knn_item_t knn_items[MAX_DOC2VEC_KNN_R];
152+
knn_item_t knn_items[100];
153153
model->sent_knn_docs(&doc, knn_items, top_n, infer_vector);
154154
// Collect result in data.frame
155155
std::vector<std::string> keys;
@@ -162,7 +162,7 @@ Rcpp::List paragraph2vec_nearest_sentence(SEXP ptr, Rcpp::List x, int top_n = 10
162162
distance.push_back(kv.similarity);
163163
r = r + 1;
164164
rank.push_back(r);
165-
if(r >= top_n || r >= MAX_DOC2VEC_KNN_R) {
165+
if(r >= top_n || r >= 100) {
166166
break;
167167
}
168168
}

0 commit comments

Comments
 (0)