Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| from app.db import supabase | |
| from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader | |
def list_all_files(bucket_name, limit_per_page=1000, *, raise_on_error=False):
    """Collect the entries listed under the empty path of a storage bucket.

    The listing is requested page by page (``limit``/``offset``) until the
    storage client returns an empty page.

    Args:
        bucket_name: Name of the Supabase storage bucket to list.
        limit_per_page: Page size sent as ``limit`` with every request.
        raise_on_error: When false (the default), a failing request is
            printed and the entries collected so far are returned, so the
            result may be incomplete. When true, the exception is re-raised
            after being printed, so callers that act on *missing* entries
            can tell "empty bucket" apart from "listing failed".

    Returns:
        A list with the entries returned by the storage client.
    """
    all_files = []
    offset = 0

    while True:
        try:
            page = supabase.storage.from_(bucket_name).list("", {
                "limit": limit_per_page,
                "offset": offset,
            })
            if not page:
                break
            all_files.extend(page)
            # Advance by what was actually received so that no entry is
            # skipped if a page comes back shorter than requested.
            offset += len(page)
        except Exception as e:
            print(f"Error fetching files with offset {offset}: {e}")
            if raise_on_error:
                raise
            break

    return all_files


def get_data():
    """Mirror the ``pnp-bot-storage`` bucket into the local ``data`` directory.

    The ``data`` directory lives next to this module and is created when it
    does not exist. Local files that are absent from the bucket listing are
    deleted, and listed files that are missing locally are downloaded.

    If the local directory cannot be read, or the bucket listing fails, the
    error is printed and the function returns without deleting or
    downloading anything. Errors on individual files are printed and the
    remaining files are still processed.
    """
    bucket_name = "pnp-bot-storage"
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    try:
        local_files = {
            f for f in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, f))
        }
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    try:
        # A partial listing must never be used for synchronisation: every
        # local file missing from it would be deleted below. Ask for the
        # error to be raised so that the sync is aborted instead.
        remote_files_info = list_all_files(bucket_name, raise_on_error=True)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    remote_files = {f["name"] for f in remote_files_info}

    # Synchronise: delete local files that no longer exist in storage.
    for filename in sorted(local_files - remote_files):
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    # Fetch the files that exist in storage but not locally.
    for filename in sorted(remote_files - local_files):
        try:
            file_path = os.path.join(data_dir, filename)
            res = supabase.storage.from_(bucket_name).download(filename)
            with open(file_path, "wb") as f:
                f.write(res)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
def load_docs():
    """Load every supported document found in the local ``data`` directory.

    The ``data`` directory lives next to this module. Files are dispatched
    on their extension, compared case-insensitively:

    * ``.pdf`` -> ``PyPDFLoader``
    * ``.docx`` / ``.doc`` -> ``Docx2txtLoader``
    * ``.txt`` -> ``TextLoader`` (UTF-8 with encoding auto-detection)

    Files with any other extension are ignored. A file that fails to load
    is reported with ``print`` and skipped.

    Returns:
        A list with the documents produced by the loaders. The list is
        empty when the directory had to be created or cannot be listed
        because of missing permissions.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")
        return []

    try:
        files = os.listdir(data_dir)
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []

    documents = []
    for file in files:
        file_path = os.path.join(data_dir, file)
        # Match extensions case-insensitively so that e.g. "REPORT.PDF" is
        # not silently skipped.
        lowered = file.lower()

        if lowered.endswith(".pdf"):
            try:
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading PDF file {file}: {e}")
        elif lowered.endswith(('.docx', '.doc')):
            # NOTE(review): legacy ".doc" files are handed to Docx2txtLoader
            # as well; presumably they fail to load and end up in the error
            # branch below - confirm.
            try:
                loader = Docx2txtLoader(file_path)
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading DOCX/DOC file {file}: {e}")
        elif lowered.endswith('.txt'):
            try:
                loader = TextLoader(file_path, encoding='utf-8', autodetect_encoding=True)
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading TXT file {file}: {e}")

    return documents