"""Module for ingesting PDF files containing quotes.""" import os import random import subprocess from typing import List from .IngestorInterface import IngestorInterface from .QuoteModel import QuoteModel class PDFIngestor(IngestorInterface): """Subclass for ingesting PDF files.""" allowed_extensions = ["pdf"] @classmethod def parse(cls, path: str) -> List[QuoteModel]: """Parse the PDF file to extract quotes.""" if not cls.can_ingest(path): raise Exception("Invalid ingest path") quotes = [] tmp = f"./tmp/{random.randint(0, 10000)}.txt" try: # pdftotext subprocess.call(["pdftotext", path, tmp]) with open(tmp, "r") as file: lines = file.readlines() except FileNotFoundError as err: print(f"Error: {err}") else: for line in lines: line = line.strip() if line: parts = line.split(" - ") quotes.append(QuoteModel(parts[0], parts[1])) finally: if os.path.exists(tmp): os.remove(tmp) return quotes