diff --git a/.gitignore b/.gitignore index 32b7418..8428ee2 100644 --- a/.gitignore +++ b/.gitignore @@ -82,7 +82,7 @@ celerybeat-schedule *.sage.py # Environments -.env +*.env .venv env/ venv/ @@ -103,4 +103,15 @@ venv.bak/ # mypy .mypy_cache/ -*.env +# VSCode +.vscode + +# SonarLint +.sonarlint + +# IntelliJ +.idea/ + +# Downloaded Media +media/ +book/ diff --git a/README.md b/README.md index 24ff5a7..b038190 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,17 @@ Script to download all your PacktPub books inspired by https://github.com/ozziep Since PacktPub restructured their website [packtpub-library-downloader](https://github.com/ozzieperez/packtpub-library-downloader) became obsolete because the downloader used webscraping. So I figured out that now PacktPub uses a REST API. Then I found which endpoint to use for downloading books and made a simple script. Feel free to fork and PR to improve. Packtpub's API isn't documented :'( ## Usage: + pip install -r requirements.txt - python main.py -e -p [-d -b -s -v -q] + python main.py -e -p [-d -b -s -v -q] [-i ] ##### Example: Download books in PDF format - python main.py -e hello@world.com -p p@ssw0rd -d ~/Desktop/packt -b pdf,epub,mobi,code + + python main.py -e hello@world.com -p p@ssw0rd -d ~/Desktop/packt -b pdf,epub,mobi,code ## Docker integration -You must put your data in the `.env` file. +You must put your data in the `.env` file. ``` mv data.env-sample data.env @@ -27,21 +29,24 @@ docker-compose up After the execution, you can see the content in the `book` directory. - ## Commandline Options -- *-e*, *--email* = Your login email -- *-p*, *--password* = Your login password -- *-d*, *--directory* = Directory to download into. Default is "media/" in the current directory -- *-b*, *--books* = Assets to download. Options are: *pdf,mobi,epub,code* -- *-s*, *--separate* = Create a separate directory for each book -- *-v*, *--verbose* = Show more detailed information -- *-q*, *--quiet* = Don't show information or progress bars + +- _-e_, _--email_ = Your login email +- _-p_, _--password_ = Your login password +- _-d_, _--directory_ = Directory to download into. Default is "media/" in the current directory +- _-b_, _--books_ = Assets to download. Options are: _pdf,mobi,epub,code,video_ +- _-s_, _--separate_ = Create a separate directory for each book +- _-v_, _--verbose_ = Show more detailed information +- _-q_, _--quiet_ = Don't show information or progress bars +- _-i_, _--ids_ = Products to download by id (If it is not specified, it will download all products that you have purchased) +- _-R_, _--readme_ = Create a README.md file with info of the book (_--separate_ option required) **Book File Types** -- *pdf*: PDF format -- *mobi*: MOBI format -- *epub*: EPUB format -- *code*: Accompanying source code, saved as .zip files +- _pdf_: PDF format +- _mobi_: MOBI format +- _epub_: EPUB format +- _code_: Accompanying source code, saved as .zip files +- _video_: Some courses are in video format -I'm working on Python 3.6.0 +I'm working on Python 3.6.0 diff --git a/config.py b/config.py index e0861fd..9b5faec 100644 --- a/config.py +++ b/config.py @@ -9,12 +9,21 @@ # this is base url where i do the requests BASE_URL = "https://services.packtpub.com/" +# this is base url for static content +BASE_STATIC_URL = "https://static.packt-cdn.com/" + # URL to request jwt token, params by post are user and pass, return jwt token AUTH_ENDPOINT = "auth-v1/users/tokens" # URL to get all your books, two params that i change are offset and limit, method GET PRODUCTS_ENDPOINT = "entitlements-v1/users/me/products?sort=createdAt:DESC&offset={offset}&limit={limit}" +# URL(BASE_STATIC) to get book information from id +PRODUCT_FROM_ID_ENDPOINT = "products/{id}/summary" + +# URL(BASE_STATIC) to get author information from id +AUTHOR_FROM_ID_ENDPOINT = "authors/{id}" + # URL to get types , param is book id, method GET URL_BOOK_TYPES_ENDPOINT = "products-v1/products/{book_id}/types" diff --git a/data.env-sample b/data.env-sample index c50c559..6f95966 100644 --- a/data.env-sample +++ b/data.env-sample @@ -1,2 +1,3 @@ EMAIL=email@example.com -PASSWORD=example$password \ No newline at end of file +PASSWORD=example$password +IDS= \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh index 51bf5cb..c06147e 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,2 +1,8 @@ pip install -r /app/requirements.txt -python /app/main.py -e $EMAIL -p $PASSWORD -d /app/book -b pdf,mobi,epub,code \ No newline at end of file + +if [[ -z "${IDS}" ]]; then +# python /app/main.py -e $EMAIL -p $PASSWORD -d /app/book -b pdf,mobi,epub,code,video -s -v + python /app/main.py -e $EMAIL -p $PASSWORD -d /app/book -b pdf,mobi,epub -s -v -R +else + python /app/main.py -e $EMAIL -p $PASSWORD -d /app/book -b pdf,mobi,epub,code,video -s -v -R -i $IDS +fi \ No newline at end of file diff --git a/main.py b/main.py index dd6f88d..8b679de 100755 --- a/main.py +++ b/main.py @@ -9,23 +9,38 @@ import getopt import requests from tqdm import tqdm, trange -from config import BASE_URL, PRODUCTS_ENDPOINT, URL_BOOK_TYPES_ENDPOINT, URL_BOOK_ENDPOINT +from config import BASE_URL, BASE_STATIC_URL, PRODUCTS_ENDPOINT, PRODUCT_FROM_ID_ENDPOINT, URL_BOOK_TYPES_ENDPOINT, URL_BOOK_ENDPOINT, AUTHOR_FROM_ID_ENDPOINT from user import User +error_message = 'Usage: main.py -e -p [-d -b -i -sR -v -q]' -#TODO: I should do a function that his only purpose is to request and return data + +# TODO: I should do a function that his only purpose is to request and return data def book_request(user, offset=0, limit=10, verbose=False): data = [] url = BASE_URL + PRODUCTS_ENDPOINT.format(offset=offset, limit=limit) if verbose: - print(url) + tqdm.write(url) r = requests.get(url, headers=user.get_header()) data += r.json().get('data', []) return url, r, data + +def book_from_id_request(book_id, verbose=False): + url = BASE_STATIC_URL + PRODUCT_FROM_ID_ENDPOINT.format(id=book_id) + if verbose: + tqdm.write(url) + + r = requests.get(url) + rjson = r.json() + data = {'productId': book_id, 'productName': rjson.get('title')} + + return url, r, data + + def get_books(user, offset=0, limit=10, is_verbose=False, is_quiet=False): - ''' + """ Request all your books, return json with info of all your books Params ... @@ -33,14 +48,14 @@ def get_books(user, offset=0, limit=10, is_verbose=False, is_quiet=False): offset : int limit : int how many book wanna get by request - ''' + """ # TODO: given x time jwt expired and should refresh the header, user.refresh_header() - + url, r, data = book_request(user, offset, limit) - + print(f'You have {str(r.json()["count"])} books') print("Getting list of books...") - + if not is_quiet: pages_list = trange(r.json()['count'] // limit, unit='Pages') else: @@ -51,54 +66,77 @@ def get_books(user, offset=0, limit=10, is_verbose=False, is_quiet=False): return data -def get_url_book(user, book_id, format='pdf'): - ''' +def get_books_from_ids(ids, is_verbose=False, is_quiet=False): + """ + Get all books from id + Params + ... + ids : list + """ + + data = [] + + print("Getting list of books...") + + if not is_quiet: + id_iter = tqdm(ids, unit="Pages") + else: + id_iter = ids + + for book_id in id_iter: + data.append(book_from_id_request(book_id, is_verbose)[2]) + + return data + + +def get_url_book(user, book_id, file_format='pdf'): + """ Return url of the book to download - ''' - - url = BASE_URL + URL_BOOK_ENDPOINT.format(book_id=book_id, format=format) + """ + + url = BASE_URL + URL_BOOK_ENDPOINT.format(book_id=book_id, format=file_format) r = requests.get(url, headers=user.get_header()) - if r.status_code == 200: # success + if r.status_code == 200: # success return r.json().get('data', '') - elif r.status_code == 401: # jwt expired - user.refresh_header() # refresh token - get_url_book(user, book_id, format) # call recursive - - print('ERROR (please copy and paste in the issue)') - print(r.json()) - print(r.status_code) - return '' + elif r.status_code == 401: # jwt expired + user.refresh_header() # refresh token + get_url_book(user, book_id, file_format) # call recursive + + tqdm.write('ERROR (please copy and paste in the issue): ' + str(r.status_code)) + for key,value in r.json().items(): + tqdm.write(' ' + key + ': ' + str(value)) + raise PermissionError('Could not download book: ' + book_id + ' in format: ' + file_format) def get_book_file_types(user, book_id): - ''' + """ Return a list with file types of a book - ''' + """ url = BASE_URL + URL_BOOK_TYPES_ENDPOINT.format(book_id=book_id) r = requests.get(url, headers=user.get_header()) - if (r.status_code == 200): # success + if r.status_code == 200: # success return r.json()['data'][0].get('fileTypes', []) - - elif (r.status_code == 401): # jwt expired - user.refresh_header() # refresh token - get_book_file_types(user, book_id, format) # call recursive - - print('ERROR (please copy and paste in the issue)') - print(r.json()) - print(r.status_code) + + elif r.status_code == 401: # jwt expired + user.refresh_header() # refresh token + return get_book_file_types(user, book_id) # call recursive + + tqdm.write('ERROR (please copy and paste in the issue): ' + str(r.status_code)) + for key, value in r.json().items(): + tqdm.write(' ' + key + ': ' + str(value)) return [] # TODO: i'd like that this functions be async and download faster -def download_book(filename, url): - ''' - Download your book - ''' - print('Starting to download ' + filename) +def download_file(filename, url): + """ + Download file + """ + tqdm.write('Starting to download ' + filename) with open(filename, 'wb') as f: r = requests.get(url, stream=True) @@ -107,17 +145,34 @@ def download_book(filename, url): f.write(response.content) else: total = int(total) - # TODO: read more about tqdm - for chunk in tqdm(r.iter_content(chunk_size=1024), total=math.ceil(total//1024), unit='KB', unit_scale=True): + progress = tqdm( + total=math.ceil(total), + unit='KB', + unit_scale=True, + mininterval=1 + ) + for chunk in r.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush() - print('Finished ' + filename) + progress.update(1024) + progress.close() + tqdm.write('Finished ' + filename) + + +def get_book_name(book, file_type): + book_name = book['productName'].replace(' ', '_').replace('.', '_').replace(':', '_').replace('/','').replace('?','') + if file_type == 'video' or file_type == 'code': + return book_name, book_name + '.' + file_type + '.zip' + else: + return book_name, book_name + '.' + file_type def make_zip(filename): if filename[-4:] == 'code': - os.replace(filename, filename[:-4] + 'zip') + os.replace(filename, filename[:-4] + 'code.zip') + elif filename[-5:] == 'video': + os.replace(filename, filename[:-5] + 'video.zip') def move_current_files(root, book): @@ -129,37 +184,161 @@ def move_current_files(root, book): except OSError: os.rename(f, f'{sub_dir}/{book}' + '_1' + f[f.index('.'):]) except ValueError as e: - print(e) - print('Skipping') + tqdm.write(repr(e)) + tqdm.write('Skipping') + + +def download_book_by_type(user, book, file_type, separate, root_directory, verbose=False): + book_name, book_filename = get_book_name(book, file_type) + if separate: + filename = f'{root_directory}/{book_name}/{book_filename}' + move_current_files(root_directory, book_name) + else: + filename = f'{root_directory}/{book_filename}' + + if os.path.exists(filename): + if verbose: + tqdm.write(f'{filename} already exists, skipping.') + return + + try: + # get url of the book to download + url = get_url_book(user, book['productId'], file_type) + download_file(filename, url) + except PermissionError as e: + tqdm.write(repr(e)) + tqdm.write('Skipping') + + +def download_all_books(user, books, book_file_types, separate, root_directory, verbose=False, quiet=False): + tqdm.write('Downloading books...') + if not quiet: + books_iter = tqdm(books, unit='Book') + else: + books_iter = books + for book in books_iter: + # get the different file type of current book + file_types = get_book_file_types(user, book['productId']) + for file_type in file_types: + if file_type in book_file_types: # check if the file type entered is available by the current book + download_book_by_type( + user, book, file_type, separate, root_directory, verbose) def does_dir_exist(directory): + # Check if directory not exists if not os.path.exists(directory): try: + # try making dir if not exists os.makedirs(directory) except Exception as e: print(e) sys.exit(2) +def get_author_info(author_id): + url = BASE_STATIC_URL + AUTHOR_FROM_ID_ENDPOINT.format(id=author_id) -def main(argv): - # thanks to https://github.com/ozzieperez/packtpub-library-downloader/blob/master/downloader.py + r = requests.get(url) + rjson = r.json() + return rjson + + +def get_book_info(book_id): + url = BASE_STATIC_URL + PRODUCT_FROM_ID_ENDPOINT.format(id=book_id) + + r = requests.get(url) + rjson = r.json() + authors = [] + + + try: + for author in rjson.get('authors'): + authors.append(get_author_info(author).get('author')) + data = { + 'title': rjson.get('title'), + 'authors': authors, + 'isbn13': rjson.get('isbn13'), + 'description': rjson.get('oneLiner'), + 'pages': rjson.get('pages'), + 'releaseDate': rjson.get('publicationDate')[:10], + 'category': rjson.get('category'), + 'homepage': f"https://subscription.packtpub.com{rjson.get('readUrl')}" + } + except: + pass + + return data + +# TODO: Get link to Github repository where present (see book 9781789957754) +def create_readme(path, book): + filename = os.path.join(path, 'README.md') + try: + data = get_book_info(book['productId']) + + with open(filename, 'w', encoding='utf-8') as file: + file.write(f"# {str(data['title'])}\n") + file.write('\n') + file.write(f"- By {', '.join(data['authors'])}\n") + file.write(f"- Publication date: {data['releaseDate']}\n") + file.write(f"- ISBN: {data['isbn13']}\n") + file.write(f"- Pages: {data['pages']}\n") + file.write('\n') + file.write(data['description'] + '\n') + file.write('\n') + file.write(f"* [Book Home Page]({data['homepage']})\n") + for k, v in book['files'].items(): + file.write(f"* [{k.upper()}]({v})\n") + file.write('\n') + file.write('') + except Exception as e: + pass + + +def get_opts_args(argv): + try: + return getopt.getopt( + argv, + 'e:p:d:b:i:svqR', + [ + 'email=', + 'pass=', + 'directory=', + 'books=', + 'ids=', + 'separate', + 'verbose', + 'quiet', + 'readme' + ] + ) + except getopt.GetoptError: + print(error_message) + sys.exit(2) + +def check_arg(email, password, verbose, quiet): + # Is this true? + if verbose and quiet: + print("Verbose and quiet cannot be used together.") + sys.exit(2) + + # do we have the minimum required info? + if not email or not password: + print(error_message) + sys.exit(2) + +def parse_args(argv): email = None password = None - root_directory = 'media' + root_directory = 'media' book_file_types = ['pdf', 'mobi', 'epub', 'code'] separate = None verbose = None quiet = None - errorMessage = 'Usage: main.py -e -p [-d -b -s -v -q]' + download_ids = None + readme = None - # get the command line arguments/options - try: - opts, args = getopt.getopt( - argv, 'e:p:d:b:svq', ['email=', 'pass=', 'directory=', 'books=', 'separate', 'verbose', 'quiet']) - except getopt.GetoptError: - print(errorMessage) - sys.exit(2) + # get all options from argument + opts, args = get_opts_args(argv) # hold the values of the command line options for opt, arg in opts: @@ -178,15 +357,34 @@ def main(argv): verbose = True elif opt in ('-q', '--quiet'): quiet = True + elif opt in ('-R', '--readme'): + readme = True + elif opt in ('-i', '--ids'): + download_ids = arg.split(',') - if verbose and quiet: - print("Verbose and quiet cannot be used together.") - sys.exit(2) + check_arg(email, password, verbose, quiet) - # do we have the minimum required info? - if not email or not password: - print(errorMessage) - sys.exit(2) + return email, \ + password, \ + root_directory, \ + book_file_types, \ + separate, verbose, \ + quiet, \ + download_ids, \ + readme + + +def main(argv): + # thanks to https://github.com/ozzieperez/packtpub-library-downloader/blob/master/downloader.py + email, \ + password, \ + root_directory, \ + book_file_types, \ + separate, \ + verbose, \ + quiet, \ + download_ids, \ + readme = parse_args(argv) # check if not exists dir and create does_dir_exist(root_directory) @@ -195,8 +393,12 @@ def main(argv): user = User(email, password) # get all your books - books = get_books(user, is_verbose=verbose, is_quiet=quiet) - print('Downloading books...') + if download_ids: + books = get_books_from_ids( + download_ids, is_verbose=verbose, is_quiet=quiet) + else: + books = get_books(user, is_verbose=verbose, is_quiet=quiet) + tqdm.write('Downloading books...') if not quiet: books_iter = tqdm(books, unit='Book') else: @@ -204,22 +406,34 @@ def main(argv): for book in books_iter: # get the different file type of current book file_types = get_book_file_types(user, book['productId']) + tqdm.write('Requested formats: ' + ','.join(book_file_types) + ' but only available: ' + ','.join(file_types)) + book_name = book['productName'].replace(' ', '_').replace('.', '_').replace(':', '_').replace('/','').replace('?','') + book['files'] = {} + if separate: + filepath = f'{root_directory}/{book_name}' + move_current_files(root_directory, book_name) + else: + filepath = f'{root_directory}' for file_type in file_types: if file_type in book_file_types: # check if the file type entered is available by the current book - book_name = book['productName'].replace(' ', '_').replace('.', '_').replace(':', '_').replace('/','') - if separate: - filename = f'{root_directory}/{book_name}/{book_name}.{file_type}' - move_current_files(root_directory, book_name) - else: - filename = f'{root_directory}/{book_name}.{file_type}' - # get url of the book to download - url = get_url_book(user, book['productId'], file_type) - if not os.path.exists(filename) and not os.path.exists(filename.replace('.code', '.zip')): - download_book(filename, url) - make_zip(filename) - else: + filename = f'{filepath}/{book_name}.{file_type}' + book['files'][file_type] = f'{book_name}.{file_type}' + # implied check for pdf, epub, mobi, also avoid name collision when both code and video are available. + if os.path.exists(filename.replace('.code', '.code.zip').replace('.video', '.video.zip')): if verbose: tqdm.write(f'{filename} already exists, skipping.') + continue + + try: + # get url of the book to download + url = get_url_book(user, book['productId'], file_type) + download_file(filename, url) + make_zip(filename) + except PermissionError as e: + tqdm.write(repr(e)) + tqdm.write('Skipping') + if separate and readme: + create_readme(f'{filepath}', book) if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 6156c01..ae08abc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ aiofiles==0.4.0 -aiohttp==3.5.4 +aiohttp==3.7.4 async-timeout==3.0.1 attrs==18.2.0 certifi==2018.11.29 @@ -7,8 +7,8 @@ chardet==3.0.4 idna==2.8 idna-ssl==1.1.0 multidict==4.5.2 -requests==2.21.0 +requests==2.26.0 tqdm==4.30.0 typing-extensions==3.7.2 -urllib3==1.24.1 +urllib3==1.26.5 yarl==1.3.0 diff --git a/user.py b/user.py index 689cd70..2502752 100644 --- a/user.py +++ b/user.py @@ -5,6 +5,7 @@ import requests from config import BASE_URL, AUTH_ENDPOINT + class User: """ User object that contain his header @@ -13,30 +14,44 @@ class User: password = "" # need to fill Authoritazion with current token provide by api header = { - "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 " + + "User-Agent": + "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36", - "Authorization":"" - } - + "Authorization": "" + } + def __init__(self, username, password): self.username = username self.password = password self.header["Authorization"] = self.get_token() - + def get_token(self): """ Request auth endpoint and return user token """ - url = BASE_URL+AUTH_ENDPOINT - # use json paramenter because for any reason they send user and pass in plain text :'( - r = requests.post(url, json={'username':self.username, 'password':self.password}) + url = BASE_URL + AUTH_ENDPOINT + # use json paramenter because for any reason they send user and pass in plain text :'( + r = requests.post(url, + json={ + 'username': self.username, + 'password': self.password + }) + if r.status_code == 200: + print("You are in!") + return 'Bearer ' + r.json()['data']['access'] + + r = requests.post(url, + json={ + 'username': self.username, + 'password': self.password + }) if r.status_code == 200: print("You are in!") return 'Bearer ' + r.json()['data']['access'] - - # except should happend when user and pass are incorrect + + # except should happend when user and pass are incorrect print("Error login, check user and password") - print("Error {}".format(e)) + #print("Error {}".format(e)) sys.exit(2) def get_header(self): @@ -49,4 +64,3 @@ def refresh_header(self): self.header["Authorization"] = self.get_token() return self.header -