@@ -50,24 +50,18 @@ x <- subset(x, nwords < 1000 & nchar(text) > 0)
5050- Build the model
5151
5252
53- ``` r
54- model <- paragraph2vec(x = x , type = " PV-DBOW" , dim = 100 , iter = 20 , min_count = 5 ,
55- lr = 0.05 , threads = 4 )
56- ```
57-
58-
5953``` r
6054## Low-dimensional model using DM, low number of iterations, for speed and display purposes
61- model <- paragraph2vec(x = x , type = " PV-DM" , dim = 5 , iter = 3 , min_count = 5 ,
62- lr = 0.05 , threads = 1 )
55+ model <- paragraph2vec(x = x , type = " PV-DM" , dim = 5 , iter = 3 ,
56+ min_count = 5 , lr = 0.05 , threads = 1 )
6357str(model )
6458```
6559
6660```
6761## List of 3
6862## $ model :<externalptr>
6963## $ data :List of 4
70- ## ..$ file : chr "C:\\Users\\Jan\\AppData\\Local\\Temp\\RtmpApjuPd\\textspace_1ef05c50176 .txt"
64+ ## ..$ file : chr "C:\\Users\\Jan\\AppData\\Local\\Temp\\Rtmpk9Npjg\\textspace_1c4458cb6943 .txt"
7165## ..$ n : num 170469
7266## ..$ n_vocabulary: num 3867
7367## ..$ n_docs : num 1000
@@ -84,6 +78,13 @@ str(model)
8478## - attr(*, "class")= chr "paragraph2vec_trained"
8579```
8680
81+
82+ ``` r
83+ ## More realistic model
84+ model <- paragraph2vec(x = x , type = " PV-DBOW" , dim = 100 , iter = 20 ,
85+ min_count = 5 , lr = 0.05 , threads = 4 )
86+ ```
87+
8788- Get the embedding of the documents or words and get the vocabulary
8889
8990
@@ -104,14 +105,22 @@ sentences <- list(
104105embedding <- predict(model , newdata = sentences , type = " embedding" )
105106embedding <- predict(model , newdata = c(" geld" , " koning" ), type = " embedding" , which = " words" )
106107embedding <- predict(model , newdata = c(" doc_1" , " doc_10" , " doc_3" ), type = " embedding" , which = " docs" )
107- embedding
108+ ncol( embedding )
108109```
109110
110111```
111- ## [,1] [,2] [,3] [,4] [,5]
112- ## doc_1 0.09160496 0.5503142 -0.5195833 0.162630379 -0.62637627
113- ## doc_10 0.43539885 0.1009961 -0.8531511 0.266749799 0.03471836
114- ## doc_3 0.59375095 0.3877517 -0.6868675 0.002579026 -0.15910600
112+ ## [1] 100
113+ ```
114+
115+ ``` r
116+ embedding [, 1 : 4 ]
117+ ```
118+
119+ ```
120+ ## [,1] [,2] [,3] [,4]
121+ ## doc_1 0.08172660 -0.03679979 0.05726605 -0.06496991
122+ ## doc_10 0.13976580 0.10821507 -0.06986591 -0.05825572
123+ ## doc_3 0.09486584 -0.07999156 0.03448128 0.02999697
115124```
116125
117126- Get similar documents or words when providing sentences, documents or words
124133
125134```
126135## [[1]]
127- ## term1 term2 similarity rank
128- ## 1 proximus neemt 0.9994797 1
129- ## 2 proximus plaatse 0.9994527 2
130- ## 3 proximus ver 0.9993714 3
131- ## 4 proximus gratis 0.9992922 4
132- ## 5 proximus hiermee 0.9992417 5
136+ ## term1 term2 similarity rank
137+ ## 1 proximus telefoontoestellen 0.5571629 1
138+ ## 2 proximus belfius 0.4994604 2
139+ ## 3 proximus toenmalige 0.4873388 3
140+ ## 4 proximus internetverbinding 0.4730936 4
141+ ## 5 proximus gefactureerd 0.4568973 5
133142##
134143## [[2]]
135- ## term1 term2 similarity rank
136- ## 1 koning pleiten 0.9984228 1
137- ## 2 koning ongeacht 0.9983451 2
138- ## 3 koning pensionering 0.9982112 3
139- ## 4 koning profielen 0.9981233 4
140- ## 5 koning beschermd 0.9978001 5
144+ ## term1 term2 similarity rank
145+ ## 1 koning grondwet 0.5572801 1
146+ ## 2 koning verplaatsingen 0.5373006 2
147+ ## 3 koning ministerie 0.5140343 3
148+ ## 4 koning familie 0.4943074 4
149+ ## 5 koning vereiste 0.4715540 5
141150```
142151
143152``` r
148157```
149158## [[1]]
150159## term1 term2 similarity rank
151- ## 1 proximus doc_77 0.9989672 1
152- ## 2 proximus doc_263 0.9989251 2
153- ## 3 proximus doc_260 0.9982057 3
154- ## 4 proximus doc_344 0.9980863 4
155- ## 5 proximus doc_408 0.9979483 5
160+ ## 1 proximus doc_105 0.6922343 1
161+ ## 2 proximus doc_863 0.5826316 2
162+ ## 3 proximus doc_186 0.5146015 3
163+ ## 4 proximus doc_862 0.5051525 4
164+ ## 5 proximus doc_746 0.4467830 5
156165##
157166## [[2]]
158167## term1 term2 similarity rank
159- ## 1 koning doc_553 0.9980003 1
160- ## 2 koning doc_477 0.9964797 2
161- ## 3 koning doc_658 0.9955103 3
162- ## 4 koning doc_99 0.9953933 4
163- ## 5 koning doc_163 0.9953347 5
168+ ## 1 koning doc_44 0.6228581 1
169+ ## 2 koning doc_583 0.5643232 2
170+ ## 3 koning doc_45 0.5535781 3
171+ ## 4 koning doc_797 0.4408725 4
172+ ## 5 koning doc_943 0.4039679 5
164173```
165174
166175``` r
171180```
172181## [[1]]
173182## term1 term2 similarity rank
174- ## 1 doc_198 doc_882 0.9992993 1
175- ## 2 doc_198 doc_709 0.9990637 2
176- ## 3 doc_198 doc_122 0.9989671 3
177- ## 4 doc_198 doc_121 0.9988763 4
178- ## 5 doc_198 doc_569 0.9988336 5
183+ ## 1 doc_198 doc_343 0.4893735 1
184+ ## 2 doc_198 doc_569 0.4858374 2
185+ ## 3 doc_198 doc_358 0.4831750 3
186+ ## 4 doc_198 doc_498 0.4766597 4
187+ ## 5 doc_198 doc_983 0.4761481 5
179188##
180189## [[2]]
181190## term1 term2 similarity rank
182- ## 1 doc_285 doc_722 0.9988106 1
183- ## 2 doc_285 doc_467 0.9977189 2
184- ## 3 doc_285 doc_250 0.9976925 3
185- ## 4 doc_285 doc_174 0.9975280 4
186- ## 5 doc_285 doc_294 0.9968556 5
191+ ## 1 doc_285 doc_319 0.5304061 1
192+ ## 2 doc_285 doc_286 0.5205777 2
193+ ## 3 doc_285 doc_76 0.5086077 3
194+ ## 4 doc_285 doc_74 0.4975725 4
195+ ## 5 doc_285 doc_537 0.4802507 5
187196```
188197
189198``` r
197206```
198207## $sent1
199208## term1 term2 similarity rank
200- ## 1 sent1 doc_980 0.9784521 1
201- ## 2 sent1 doc_758 0.9678799 2
202- ## 3 sent1 doc_806 0.9547009 3
203- ## 4 sent1 doc_764 0.9544759 4
204- ## 5 sent1 doc_842 0.9529226 5
209+ ## 1 sent1 doc_740 0.4637638 1
210+ ## 2 sent1 doc_742 0.4621139 2
211+ ## 3 sent1 doc_206 0.4315273 3
212+ ## 4 sent1 doc_825 0.4221503 4
213+ ## 5 sent1 doc_151 0.4183135 5
205214##
206215## $sent2
207216## term1 term2 similarity rank
208- ## 1 sent2 doc_842 0.9873239 1
209- ## 2 sent2 doc_764 0.9832168 2
210- ## 3 sent2 doc_564 0.9739662 3
211- ## 4 sent2 doc_980 0.9675324 4
212- ## 5 sent2 doc_542 0.9622889 5
217+ ## 1 sent2 doc_105 0.5789919 1
218+ ## 2 sent2 doc_186 0.4938067 2
219+ ## 3 sent2 doc_862 0.4848365 3
220+ ## 4 sent2 doc_863 0.4685720 4
221+ ## 5 sent2 doc_620 0.4497271 5
213222```
214223
215224``` r
0 commit comments