AbeerTrial's picture
Duplicate from AbeerTrial/SOAPAssist
35b22df
"""Simple reader for mbox (mailbox) files."""
import os
from pathlib import Path
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.file.mbox_parser import MboxParser
from gpt_index.readers.schema.base import Document
class MboxReader(BaseReader):
"""Mbox e-mail reader.
Reads a set of e-mails saved in the mbox format.
"""
def __init__(self) -> None:
"""Initialize."""
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
"""Load data from the input directory.
load_kwargs:
max_count (int): Maximum amount of messages to read.
message_format (str): Message format overriding default.
"""
docs: List[Document] = []
for (dirpath, dirnames, filenames) in os.walk(input_dir):
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for filename in filenames:
if filename.endswith(".mbox"):
filepath = os.path.join(dirpath, filename)
content = MboxParser(**load_kwargs).parse_file(Path(filepath))
for msg in content:
docs.append(Document(msg))
return docs