|
|
|
|
|
import re |
|
from pathlib import Path |
|
from typing import List |
|
|
|
from langchain.docstore.document import Document |
|
from langchain.document_loaders.base import BaseLoader |
|
|
|
|
|
def concatenate_rows(date: str, sender: str, text: str) -> str:
    """Format a single chat message as a readable, self-terminated entry.

    Args:
        date: Timestamp of the message; inserted verbatim.
        sender: Name of the message author; inserted verbatim.
        text: Body of the message; inserted verbatim.

    Returns:
        ``"<sender> on <date>: <text>"`` followed by two newlines, so that
        entries concatenated back to back are separated by a blank line.
    """
    return f"{sender} on {date}: {text}\n\n"
|
|
|
|
|
|
|
|
|
|
|
class WhatsAppChatLoader(BaseLoader):
    """Load `WhatsApp` messages from a CSV file.

    The file is parsed with pandas and must provide the columns ``date``,
    ``username`` and ``message``; any additional columns are ignored.
    """

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Location of the CSV file to read when ``load`` is called.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Reads the CSV at ``self.file_path``, formats each kept row with
        ``concatenate_rows`` and joins the entries into one document.

        Rows are dropped when the message is empty/missing or contains one
        of the placeholder markers ("This message was deleted",
        "<Media omitted>").

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the stripped concatenation of all kept
            messages and whose ``metadata`` is ``{"source": <file path>}``.

        Raises:
            ImportError: If pandas is not installed.
            KeyError: If any of the required columns is absent.
        """
        # pandas is imported lazily so that importing this module does not
        # require it.
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "pandas is needed to load the WhatsApp CSV file, "
                "please install it with `pip install pandas`."
            ) from e

        p = Path(self.file_path)
        ignore_lines = ("This message was deleted", "<Media omitted>")

        # dtype=str + keep_default_na=False keep every cell as the literal
        # text from the file: an empty message stays "" and messages such as
        # "NA" or "null" are not silently converted to float NaN, which
        # would make the substring checks below raise TypeError.
        df = pd.read_csv(p, dtype=str, keep_default_na=False)[
            ["date", "username", "message"]
        ]

        entries = [
            concatenate_rows(date, sender, text)
            for date, sender, text in zip(
                df["date"], df["username"], df["message"]
            )
            # isinstance guards against NaN from short/ragged rows; the
            # truthiness check skips empty messages.
            if isinstance(text, str)
            and text
            and not any(marker in text for marker in ignore_lines)
        ]

        metadata = {"source": str(p)}

        return [
            Document(page_content="".join(entries).strip(), metadata=metadata)
        ]