2020-04-27 10:17:53 +02:00
#############################################
###
### Download Springer Books
### Corona Virus Time
###
### carlos@cardenas.pe
###
### GPL 3.0 v
###
### 27/04/2020
###
#############################################
import PyPDF2
import urllib3
2020-05-04 02:32:16 +02:00
import os
2020-04-27 10:17:53 +02:00
import wget
2020-05-04 02:32:16 +02:00
def download_book_from_page(page_url):
    """Download the PDF of the book presented on a Springer book page.

    The page is fetched and scraped with plain string splitting:

    * the file name is the text that follows the first ``h1`` occurrence
      (between the next ``>`` and ``<``), cut at the first ``/`` and
      suffixed with ``.pdf``;
    * the PDF location is the first ``content/`` path that appears before
      the "Download book PDF" label.

    The PDF is saved under that file name, relative to the current working
    directory.  If a file with that name already exists, nothing is
    downloaded.

    Args:
        page_url: URL of the book's landing page.

    Raises:
        IndexError: if the page does not contain the expected ``h1`` or
            ``content/`` markers.
        Errors raised by ``urllib3`` or ``wget`` are not handled here and
        propagate to the caller.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', page_url)

    # Decode the body once and reuse it for both the title and the link.
    html = response.data.decode('utf-8')

    heading = html.split('h1')[1].split('>')[1].split('<')[0]
    # Keep only the part before the first '/' so the title stays a plain
    # file name.
    title = heading.split('/')[0] + ".pdf"

    # skip books already downloaded
    if os.path.isfile(title):
        return

    before_label = html.split('Download book PDF')[0]
    pdf_path = before_label.split('content/')[1].split('title')[0].split('.pdf')[0]
    download_url = "https://link.springer.com/content/" + pdf_path + ".pdf"

    wget.download(download_url, title)
2020-04-27 10:17:53 +02:00
2020-05-04 02:32:16 +02:00
def process_books_in_pdf(pdf, max_retries=3):
    """Download every book whose page URL is listed in a PDF catalogue.

    The text of each page of ``pdf`` is split into lines.  Every line that
    starts with ``http://`` is treated as a book page URL, switched to
    ``https://`` and handed to ``download_book_from_page``.

    A failing download is retried, but only up to ``max_retries`` attempts
    in total; after that the URL is reported and skipped so that one
    permanently broken entry cannot stall the whole run.

    Args:
        pdf: reader object exposing ``numPages`` and ``getPage(i)``, whose
            pages expose ``extractText()``.
        max_retries: maximum number of download attempts per URL.
    """
    http_prefix = "http://"

    for page_number in range(pdf.numPages):
        page_lines = pdf.getPage(page_number).extractText().split('\n')

        for line in page_lines:
            if not line.startswith(http_prefix):
                continue

            # changing protocol from http to https
            url = "https://" + line[len(http_prefix):]
            print(url)

            for attempt in range(1, max_retries + 1):
                try:
                    download_book_from_page(url)
                except Exception:
                    # ``Exception`` (not a bare except) so that Ctrl-C and
                    # SystemExit still stop the script.
                    if attempt < max_retries:
                        print("Error while downloading, trying again.")
                    else:
                        print("Error while downloading, giving up on " + url)
                else:
                    break
2020-04-27 10:17:53 +02:00
2020-05-04 02:32:16 +02:00
def main(pdf_path='Spring.pdf'):
    """Download all books listed in the catalogue PDF.

    Args:
        pdf_path: path of the PDF that lists the book page URLs; defaults
            to ``Spring.pdf`` in the current working directory.
    """
    # The reader is used while the file is open, so keep the processing
    # inside the ``with`` block; the handle is closed when it ends, even if
    # processing raises.
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        process_books_in_pdf(pdf)
2020-04-27 10:17:53 +02:00
2020-05-04 02:32:16 +02:00
# Script entry point: the call is unconditional, so the download starts as
# soon as this module is executed (or imported).
main()