26 lines
762 B
Python
26 lines
762 B
Python
"""Module for ingesting Docx files containing quotes."""
|
|
|
|
import docx
|
|
from typing import List
|
|
from .IngestorInterface import IngestorInterface
|
|
from .QuoteModel import QuoteModel
|
|
|
|
|
|
class DocxIngestor(IngestorInterface):
    """Ingest quotes from Docx files.

    Each paragraph that holds a quote is expected to look like
    ``<first part> - <second part>`` (presumably quote body and author;
    confirm against ``QuoteModel``).
    """

    allowed_extensions = ["docx"]

    @classmethod
    def parse(cls, path: str) -> List[QuoteModel]:
        """Parse the Docx file at *path* and return the quotes it contains.

        Args:
            path: Location of the Docx file to read.

        Returns:
            One ``QuoteModel`` per paragraph that contains the ``" - "``
            separator, in document order. Paragraphs without the
            separator (blank lines, headings, stray text) are skipped.

        Raises:
            Exception: If ``cls.can_ingest(path)`` reports that the path
                cannot be ingested by this class.
        """
        if not cls.can_ingest(path):
            raise Exception("Invalid ingest path")

        quotes = []
        doc = docx.Document(path)
        for para in doc.paragraphs:
            parts = para.text.split(" - ")
            # Empty, whitespace-only, or separator-less paragraphs produce
            # a single part; skip them instead of failing on parts[1].
            if len(parts) < 2:
                continue
            quotes.append(QuoteModel(parts[0], parts[1]))
        return quotes
|