61 lines
1.5 KiB
Python
61 lines
1.5 KiB
Python
#############################################
|
|
###
|
|
### Download Springer Books
|
|
### Corona Virus Time
|
|
###
|
|
### carlos@cardenas.pe
|
|
###
|
|
### GPL v3.0
|
|
###
|
|
### 27/04/2020
|
|
###
|
|
#############################################
|
|
import PyPDF2
|
|
import urllib3
|
|
import os
|
|
import wget
|
|
|
|
|
|
def download_book_from_page(page_url):
    """Download the PDF linked from a Springer book page into the working directory.

    The page at ``page_url`` is fetched and decoded as UTF-8.  The output file
    name is the text found between the first ``>`` and the next ``<`` after
    the first occurrence of ``h1`` in the page, cut at the first ``/``, with
    ``.pdf`` appended.  If a file with that name already exists, nothing is
    downloaded.

    Args:
        page_url: URL of the book's landing page.

    Returns:
        None.

    Raises:
        IndexError: if the page does not contain the markers the string
            splitting relies on.
    """
    pool = urllib3.PoolManager()
    response = pool.request('GET', page_url)
    html = response.data.decode('utf-8')

    # Derive the local file name from the page heading.
    after_h1 = html.split('h1')[1]
    heading_text = after_h1.split('>')[1].split('<')[0]
    title = heading_text.split('/')[0] + ".pdf"

    # Only fetch books that are not on disk yet.
    if not os.path.isfile(title):
        # The PDF path is the 'content/...' fragment that appears before the
        # 'Download book PDF' text, trimmed at 'title' and at '.pdf'.
        before_link_label = html.split('Download book PDF')[0]
        content_path = before_link_label.split('content/')[1]
        content_path = content_path.split('title')[0].split('.pdf')[0]
        download_url = "https://link.springer.com/content/" + content_path + ".pdf"

        wget.download(download_url, title)
|
|
|
|
|
|
def process_books_in_pdf(pdf, max_attempts=3):
    """Download every book whose page URL is listed in ``pdf``.

    Each page's extracted text is split into lines; every line starting with
    ``http://`` is treated as a book page URL, rewritten to ``https://`` and
    passed to ``download_book_from_page``.

    A failing download is retried, but only up to ``max_attempts`` times; a
    book that keeps failing is reported and skipped so that one bad link
    cannot stall the whole run.

    Args:
        pdf: reader object exposing ``numPages`` and ``getPage(index)``, whose
            pages expose ``extractText()``.
        max_attempts: maximum number of download attempts per URL (values
            below 1 are treated as 1).

    Returns:
        None.
    """
    attempts_allowed = max(1, max_attempts)
    scheme = "http://"

    for page_index in range(pdf.numPages):
        lines = pdf.getPage(page_index).extractText().split('\n')

        for line in lines:
            if not line.startswith(scheme):
                continue

            # changing protocol from http to https
            url = "https://" + line[len(scheme):]
            print(url)

            for attempt in range(1, attempts_allowed + 1):
                try:
                    download_book_from_page(url)
                except Exception as error:
                    # Exception (not a bare except) lets Ctrl-C and
                    # SystemExit still stop the script.
                    if attempt < attempts_allowed:
                        print("Error while downloading, trying again.")
                    else:
                        print("Giving up on {} after {} attempts: {}".format(
                            url, attempts_allowed, error))
                else:
                    break
|
|
|
|
|
|
def main():
    """Read ``Spring.pdf`` from the working directory and download the books it lists."""
    # The with block guarantees the file handle is closed, even if processing
    # raises.  Processing stays inside the block because the reader is handed
    # the open file object and may still read from it while pages are extracted.
    with open('Spring.pdf', 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        process_books_in_pdf(pdf)
|
|
|
|
|
|
# Start downloading only when run as a script, not when the module is imported.
if __name__ == "__main__":
    main()
|