"""Module for ingesting PDF files containing quotes."""

import os
|
|
import random
|
|
import subprocess
|
|
from typing import List
|
|
from .IngestorInterface import IngestorInterface
|
|
from .QuoteModel import QuoteModel
|
|
|
|
|
|
class PDFIngestor(IngestorInterface):
    """Ingest quotes from PDF files by shelling out to ``pdftotext``.

    Each non-blank line of the extracted text is expected to have the form
    ``<body> - <author>``; lines that do not contain the `` - `` separator
    are ignored.
    """

    allowed_extensions = ["pdf"]

    @classmethod
    def parse(cls, path: str) -> List[QuoteModel]:
        """Parse the PDF file at ``path`` and return the quotes it contains.

        The external ``pdftotext`` command-line tool does the text
        extraction. If the tool is not installed, or it exits with a
        non-zero status, an error message is printed and an empty list is
        returned (best-effort behaviour).

        Args:
            path: Location of the PDF file to read.

        Returns:
            One ``QuoteModel`` per well-formed ``<body> - <author>`` line,
            in document order.

        Raises:
            Exception: If ``cls.can_ingest(path)`` rejects the path.
        """
        if not cls.can_ingest(path):
            raise Exception("Invalid ingest path")

        try:
            # Passing "-" as the output file makes pdftotext write to stdout,
            # so no temporary file (and no pre-existing ./tmp directory) is
            # needed and concurrent calls cannot clash on a file name.
            # pdftotext emits UTF-8 by default, so decode it explicitly
            # rather than relying on the platform's locale encoding.
            result = subprocess.run(
                ["pdftotext", path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf-8",
            )
        except FileNotFoundError as err:
            # Raised when the pdftotext executable itself cannot be found.
            print(f"Error: {err}")
            return []

        if result.returncode != 0:
            print(
                f"Error: pdftotext failed for {path}: "
                f"{result.stderr.strip()}"
            )
            return []

        quotes = []
        # splitlines() also breaks on the form-feed characters that
        # pdftotext inserts between pages.
        for raw_line in result.stdout.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            parts = line.split(" - ")
            if len(parts) < 2:
                # Not a "<body> - <author>" line (e.g. a heading or a page
                # number); skip it instead of failing the whole parse.
                continue
            quotes.append(QuoteModel(parts[0], parts[1]))
        return quotes
|