-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathScribd_Downloader.py
More file actions
48 lines (35 loc) · 1.1 KB
/
Scribd_Downloader.py
File metadata and controls
48 lines (35 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import bs4 as bs
from selenium import webdriver
from urllib import request
import os
import re
import img2pdf
# Scrape the page images of a Scribd document through a Selenium-driven
# Chrome session, download each image, and combine them into output.pdf.
url = 'https://www.scribd.com/document/284732331/Text-Monochrome-Colour-Television-R-R-Gulati-pdf'

# Grab the rendered HTML, then always shut the browser down so a failed
# page load does not leave a Chrome process behind.
browser = webdriver.Chrome()
try:
    browser.get(url)
    source = browser.page_source
finally:
    browser.quit()

soup = bs.BeautifulSoup(source, "lxml")

# Collect the image URL from every <div class="ie_fix"> wrapper; wrappers
# without an <img>, or whose <img> has no src, are skipped explicitly
# instead of being hidden behind a bare except.
images = []
for element in soup.find_all('div', attrs={'class': "ie_fix"}):
    img_tag = element.find('img')
    src = img_tag.get('src') if img_tag is not None else None
    if src:
        images.append(src)

path = 'C:/Users/Vishal/Desktop/PYTHON BEST/'
# Folder name: the URL's title segment with its last 23 characters removed.
name = url.split('/')[5][0:-23]
newpath = path + name
# Create the folder at its full path (the files below are written to
# newpath, not to a folder relative to the current working directory).
os.makedirs(newpath, exist_ok=True)

# Compiled once; the first run of digits in the file name is the page number.
page_pattern = re.compile(r"(([0-9]+))")
for image in images:
    image_name = image.split('/')[-1]
    match = page_pattern.search(image_name)
    if match is None:
        # No page number in the name: skip rather than reuse (and
        # overwrite) the previous page's file name.
        print('Skipping image without a page number: ' + image)
        continue
    page = match.group()
    # urlretrieve replaces the deprecated URLopener().retrieve().
    request.urlretrieve(image, newpath + '/' + page + '.jpg')


def _page_order(filename):
    """Sort key: numeric page files first, in page order; others by name."""
    stem = os.path.splitext(filename)[0]
    if stem.isdecimal():
        return (0, int(stem), stem)
    return (1, 0, stem)


# os.listdir() returns entries in arbitrary order, so sort numerically to
# keep the PDF pages in reading order (2.jpg before 10.jpg).
page_files = sorted(
    (entry for entry in os.listdir(newpath) if entry.endswith('.jpg')),
    key=_page_order,
)
if not page_files:
    raise SystemExit('No page images were found in ' + newpath)

# Convert before opening the output so a conversion failure does not leave
# an empty output.pdf behind.
pdf_bytes = img2pdf.convert([newpath + '/' + entry for entry in page_files])
with open('output.pdf', "wb") as f:
    f.write(pdf_bytes)