Spaces:
Runtime error
Runtime error
| """Loader that loads Telegram chat json dump.""" | |
| import json | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import List | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
def concatenate_rows(row):
    """Render one chat message as a ``"<sender> on <date>: <text>"`` paragraph.

    Args:
        row: Any object supporting key lookup for ``'date'``, ``'from'`` and
            ``'text'`` (for example a mapping or a pandas row).

    Returns:
        The formatted line followed by two newlines, so consecutive messages
        end up separated by a blank line when concatenated.
    """
    # Keys are read in the order date, from, text.
    date, sender, text = row['date'], row['from'], row['text']
    return f'{sender} on {date}: {text}\n\n'
class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat JSON dump file into a single document."""

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Location of the JSON file. It is only opened when
                ``load`` is called.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Reads the JSON file, keeps the entries of its ``"messages"`` list
        whose ``type`` is ``"message"`` and whose ``text`` is a plain string,
        and concatenates them (formatted by ``concatenate_rows``) into one
        text.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenated text (empty when nothing qualifies) and whose
            metadata ``"source"`` is the file path as a string.

        Raises:
            KeyError: If the JSON has no ``"messages"`` key, or if the
                messages lack one of the ``type``/``date``/``text``/``from``
                fields entirely.
        """
        p = Path(self.file_path)
        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame; no extra wrapping needed.
        messages = pd.json_normalize(d["messages"])

        if messages.empty:
            # An empty "messages" list yields a frame without any columns, so
            # the column lookups below would fail; there is nothing to load.
            text = ""
        else:
            # Only keep plain text messages (no services, nor links,
            # hashtags, code, bold ...): formatted messages store ``text`` as
            # a non-string value, which the isinstance check filters out.
            is_plain_text = (messages["type"] == "message") & messages[
                "text"
            ].apply(lambda value: isinstance(value, str))
            kept = messages.loc[is_plain_text, ["date", "text", "from"]]
            # Joining row by row also works when no message passes the
            # filter, where the ``.str`` accessor on the result of
            # ``apply(axis=1)`` over an empty frame is not available.
            text = "".join(
                concatenate_rows(record) for record in kept.to_dict("records")
            )

        metadata = {"source": str(p)}
        return [Document(page_content=text, metadata=metadata)]