pyrotechny-eu/cmd/calibre_to_hugo.py

275 lines
8.1 KiB
Python
Executable File

#!/Applications/calibre.app/Contents/MacOS/calibre-debug calibre_to_hugo.py
# See: https://manual.calibre-ebook.com/db_api.html
import copy
import hashlib
import json
import os
import pathlib
import shutil
import urllib
import urllib.request

import calibre.library

import config
class CalibreLibrary:
    # Forward declaration only: the full CalibreLibrary class is defined
    # further down in this module and replaces this empty stub.
    pass
class CalibreLibraryBook:
    """A single book from a calibre library plus the files published for it.

    The constructor only initialises ``cover`` and ``filehash``; the remaining
    attributes (``id``, ``title``, ``filepath``, ...) are assigned by the code
    that builds the book.
    """

    # String annotations are forward references, so they are never evaluated
    # while the class body executes.
    _db: "CalibreLibrary"
    id: int
    title: str
    authors: str
    filepath: str
    filehash: str
    filename: str
    cover: str

    def __init__(self, db: "CalibreLibrary"):
        """Keep the database handle that is later used to fetch the cover."""
        self._db = db
        self.cover = None
        self.filehash = None

    def hash(self) -> str:
        """Return the SHA-256 hex digest of the book's file path.

        The path string is hashed, not the file contents. The digest is
        cached in ``filehash`` after the first call.
        """
        if not self.filehash:
            self.filehash = hashlib.sha256(self.filepath.encode("utf8")).hexdigest()
        return self.filehash

    def to_json(self):
        """Return the instance attributes that do not start with ``_`` as a dict."""
        return {
            key: value
            for key, value in self.__dict__.items()
            if not key.startswith("_")
        }

    def ebook_filename(self) -> str:
        """Build, store and return the published filename: ``<hash><suffix>``."""
        ext = pathlib.Path(self.filepath).suffix
        self.filename = f"{self.hash()}{ext}"
        return self.filename

    def ebook_save(self, path: str) -> str:
        """Copy the ebook into ``path`` under its hashed filename.

        The copy is skipped when the destination already exists. The
        destination path is returned in both cases.
        """
        filepath = os.path.join(path, self.ebook_filename())
        if not os.path.exists(filepath):
            print(f"COPY {self.filepath} -> {filepath}")
            shutil.copyfile(self.filepath, filepath)
        return filepath

    def cover_save(self, path: str) -> "str | None":
        """Save the book cover into ``path`` and return the saved file's path.

        Returns ``None`` (and sets ``cover`` to ``None``) when the database
        has no cover for this book.
        """
        filehash = self.hash()
        # NOTE: covers are assumed to be .jpg, so an existing "<hash>.jpg" is
        # treated as "already saved" and the database is not queried.
        self.cover = os.path.join(path, f"{filehash}.jpg")
        if os.path.exists(self.cover):
            print(f"COVER {self.cover}")
            return self.cover

        cover_tmpfile = self._db.cover(self.id, as_path=True)
        if not cover_tmpfile:
            # No cover available; building a Path from None would raise.
            print(f"NO COVER for book {self.id}")
            self.cover = None
            return None

        cover_ext = pathlib.Path(cover_tmpfile).suffix
        self.cover = os.path.join(path, f"{filehash}{cover_ext}")
        # shutil.move also works when the temporary file lives on a different
        # filesystem than the destination, where os.rename would fail.
        shutil.move(cover_tmpfile, self.cover)
        print(f"SAVE COVER {self.cover}")
        return self.cover
class CalibreLibrary:
    """Access to a calibre library that yields the books to publish."""

    def __init__(self, library_path: str):
        """Open the calibre library at ``library_path`` and keep its ``new_api`` handle."""
        self._db = calibre.library.db(library_path).new_api

    @staticmethod
    def _format_size(size: int) -> str:
        """Format a byte count using 1024-based units, rounded to two decimals.

        Sizes of 1024**4 bytes and above are reported in TB, so a string is
        always returned.
        """
        if size < 1024:
            return f"{size} bytes"
        for exponent, unit in enumerate(("KB", "MB", "GB"), start=1):
            if size < 1024 ** (exponent + 1):
                return f"{round(size / 1024 ** exponent, 2)} {unit}"
        return f"{round(size / 1024 ** 4, 2)} TB"

    @staticmethod
    def _get_filesize_str(path: str) -> str:
        """Return the size of the file at ``path`` as a human-readable string."""
        return CalibreLibrary._format_size(os.path.getsize(path))

    def books(self) -> 'list[CalibreLibraryBook]':
        """Return the books of the library that can be published.

        Books without an ebook file are skipped, as are books whose hash is
        listed in ``config.CLEARWEB_FILTERED_BOOKS``.
        """
        books = []
        for book_id in self._db.all_book_ids():
            # TODO check loaded state with state of calibre based on book.id
            # hashing takes way too long...
            book = CalibreLibraryBook(self._db)
            book.id = book_id
            book.title = self._db.field_for("title", book.id)
            book.authors = self._db.field_for("authors", book.id)
            book.comments = self._db.field_for("comments", book.id)
            book._metadata = self._db.get_metadata(book.id)
            book.ids = book._metadata.get_identifiers()

            # Select only the first ebook format.
            formats = self._db.formats(book.id, verify_formats=True)
            filepath = self._db.format_abspath(book.id, formats[0]) if formats else None
            if not filepath:
                # Without a file there is nothing to hash, copy or link to.
                print(f"SKIP Book {book.title} has no ebook file in the calibre library")
                continue
            book.filepath = filepath
            book.filesize = self._get_filesize_str(book.filepath)

            if book.hash() in config.CLEARWEB_FILTERED_BOOKS:
                print(f"SKIP Book {book.title} filtered and not published on the clearweb")
                continue
            books.append(book)
        return books
class PyroTechnyLibrary:
    """Publishes the books of a calibre library as Hugo content.

    On construction the library directory is created if needed and the
    Google Drive file database (a JSON list of file entries) is downloaded.
    """

    _calibre_library: "CalibreLibrary"
    _path: str
    _tempdir: str
    _google_drive_file_db: list

    # Identifier types rendered on a book page: key -> (label, URL prefix).
    _ID_LINKS = {
        "amazon": ("Amazon", "https://www.amazon.com/dp/"),
        "google": ("Google", "https://books.google.com/books?id="),
        "isbn": ("ISBN", "https://www.worldcat.org/isbn/"),
        "doi": ("DOI", "https://dx.doi.org/"),
    }

    def __init__(self, path: str, calibre_library: "CalibreLibrary"):
        """Prepare the library directory ``path`` and load the Google Drive file db."""
        self._books = []
        self._calibre_library = calibre_library
        self._path = path
        self._state = {"books": []}
        # Create path directories
        if not os.path.exists(self._path):
            print(f"CREATE {self._path}")
            os.makedirs(self._path, 0o755)
        self._load_google_drive_file_db()

    def _get_google_drive_value_from_filename(self, filename: str, key: str) -> str:
        """Return ``key`` of the first db entry named ``filename``, or ``""`` if none matches."""
        for entry in self._google_drive_file_db:
            if entry["filename"] == filename:
                return entry[key]
        return ""

    def _load_google_drive_file_db(self):
        """Download and parse the db.json document configured in ``config``."""
        url = config.GOOGLE_DRIVE_EBOOK_LIBRRARY_DB_JSON_URL
        print(f"LOAD db.json from Google Drive: {url}")
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as resp:
            self._google_drive_file_db = json.loads(resp.read())

    def _generate_book_dl_page(self, path, book):
        """Placeholder; not implemented."""
        pass

    def _render_book_page(self, book, book_url, book_dl_url, thumbs_url, library_url) -> str:
        """Return the markdown source of the page for ``book``.

        ``book_url`` / ``book_dl_url`` are the view and download links,
        ``thumbs_url`` is the URL path under which covers are served and
        ``library_url`` is the base URL used for the "Back to library" link.
        """
        # json.dumps yields a double-quoted string with quotes and backslashes
        # escaped, so such characters in a title cannot break the front matter.
        title = json.dumps(book.title, ensure_ascii=False)
        parts = [
            "---\n"
            f"title: {title}\n"
            'description: ""\n'
            'featured_image: "/images/site/library-header.jpg"\n'
            "type: page\n"
            "---\n"
        ]

        # Cover image linking to the online view; omitted when there is no cover.
        if book.cover:
            cover = os.path.basename(book.cover)
            cover_url = os.path.join(thumbs_url, cover)
            parts.append(f'<a href="{book_url}" target="_blank">![{cover}]({cover_url})</a>\n')

        # Authors; a lone "Unknown" author is not shown.
        authors = list(book.authors or ())
        joined_authors = ', '.join(authors)
        if joined_authors and authors != ["Unknown"]:
            author_suffix = 's' if len(authors) > 1 else ''
            parts.append(f"* Author{author_suffix}: {joined_authors}\n")

        # IDs to book libraries and publishers; the "IDs" bullet is only
        # emitted when at least one identifier type is known.
        id_lines = []
        for key, value in (book.ids or {}).items():
            link = self._ID_LINKS.get(key)
            if link is None:
                continue
            label, prefix = link
            id_lines.append(f' * {label}: <a href="{prefix}{value}" target="_blank">{value}</a>\n')
        if id_lines:
            parts.append('* IDs:\n')
            parts.extend(id_lines)

        # View & Download
        parts.append(f'* <a href="{book_url}" target="_blank">View</a>\n\n')
        parts.append(f'* [Download]({book_dl_url}) ({book.filesize})\n\n')

        # Comments; the blank line keeps the heading separate from the text.
        if book.comments:
            parts.append(f'## Description\n\n{book.comments}\n\n')

        # Back
        parts.append(f'<br />[Back to library]({library_url}/)\n')
        return ''.join(parts)

    def _generate_book_page(self, path, book):
        """Write ``<hash>.md`` for ``book`` into ``path``, replacing any existing page."""
        filepath = os.path.join(path, f'{book.hash()}.md')
        page = self._render_book_page(
            book,
            book_url=self._get_google_drive_value_from_filename(book.filename, "view_url"),
            book_dl_url=self._get_google_drive_value_from_filename(book.filename, "download_url"),
            thumbs_url=config.LIBRARY_EBOOKS_THUMBS_URL_PATH,
            library_url=config.LIBRARY_EBOOKS_BASE_URL,
        )
        # TODO: When we use a template file we can check if the template
        # or script is newer than the target book page markdown file. To speed things up a bit
        print(f"GEN {filepath}")
        # Mode "w" truncates an existing file, so no prior removal is needed.
        with open(filepath, "w", encoding="utf-8") as fd:
            fd.write(page)

    def synchronize(self):
        """Copy every calibre book and its cover into the Hugo static tree and record it."""
        # Load books from calibre
        for book in self._calibre_library.books():
            book.ebook_filename()
            book.cover_save(config.HUGO_STATIC_CONTENT_LIBRARY_IMAGES_PATH)
            book.ebook_save(config.HUGO_STATIC_CONTENT_LIBRARY_PATH)
            self._state["books"].append(book)

    def generate(self):
        """Generate the Hugo markdown page of every synchronized book."""
        # per-book page generation
        book_page_path = config.HUGO_CONTENT_LIBRARY_PATH
        os.makedirs(book_page_path, 0o755, exist_ok=True)
        # TODO synced state instead of calibre copy....
        for book in self._state["books"]:
            self._generate_book_page(book_page_path, book)

    def generate_library_sitemap(self, filepath: str):
        """Write a markdown list linking to every synchronized book to ``filepath``."""
        with open(filepath, "w", encoding="utf-8") as fd:
            fd.write("# E-books library sitemap\n\n")
            for book in self._state["books"]:
                fd.write(f"* [{book.title}]({config.LIBRARY_EBOOKS_BASE_URL}/ebooks/{book.hash()})\n")
def main():
    """Run the full export: open calibre, copy the files, write the Hugo content."""
    source_library = CalibreLibrary(config.CALIBRE_LIBRARY_PATH)
    site_library = PyroTechnyLibrary(
        config.HUGO_STATIC_CONTENT_LIBRARY_PATH,
        source_library,
    )

    # synchronize() records the books that generate() and the sitemap iterate
    # over, so it has to run first.
    site_library.synchronize()
    site_library.generate()
    site_library.generate_library_sitemap(config.HUGO_CONTENT_LIBRARY_SITEMAP_PATH)


# Only run the export when executed as a script, not when imported.
if __name__ == "__main__":
    main()