Spaces:

Yozora721
/

pnp-chatbot-v1

Sleeping

pnp-chatbot-v1 / app /data_loader.py

FauziIsyrinApridal

revisi 1

daa81fb 3 months ago

3.54 kB

	import os
	import streamlit as st
	from app.db import supabase
	from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader

	@st.cache_data(ttl=60, show_spinner=False)
	def list_all_files(bucket_name, limit_per_page=1000):
	    """Return every entry listed at the root path of a storage bucket.

	    Repeatedly calls ``supabase.storage.from_(bucket_name).list("")`` with
	    an increasing offset until an empty page comes back. The function is
	    decorated with ``st.cache_data(ttl=60)``, so whatever it returns
	    (including a partial listing, see below) is what Streamlit caches.

	    Args:
	        bucket_name: Name of the storage bucket to list.
	        limit_per_page: Page size requested from the storage API.

	    Returns:
	        A list of the entries returned by the storage API. If a request
	        fails, the error is printed and the entries collected so far
	        (possibly none) are returned, so the list may be incomplete.
	    """
	    all_files = []
	    offset = 0

	    while True:
	        try:
	            page = supabase.storage.from_(bucket_name).list(
	                "",
	                {"limit": limit_per_page, "offset": offset},
	            )
	            if not page:
	                break
	            all_files.extend(page)
	            # Advance by what was actually received rather than by the
	            # requested page size: if the API returns a shorter page than
	            # asked for, stepping by limit_per_page would skip entries.
	            offset += len(page)
	        except Exception as e:
	            # Best effort: keep whatever was fetched before the failure.
	            print(f"Error fetching files with offset {offset}: {e}")
	            break

	    return all_files


	def get_data():
	    """Mirror the ``pnp-bot-storage`` bucket into the local ``data`` folder.

	    The ``data`` directory lives next to this module and is created when
	    missing. Files are compared by name only (contents are never
	    compared): local files whose names are not in the remote listing are
	    removed, and remote files whose names are not present locally are
	    downloaded.

	    Every step is best effort: failures are printed and either abort the
	    sync (local/remote listing) or skip the affected file (remove and
	    download).

	    Returns:
	        None.
	    """
	    bucket_name = "pnp-bot-storage"
	    base_dir = os.path.dirname(os.path.abspath(__file__))
	    data_dir = os.path.join(base_dir, 'data')

	    try:
	        # Creating the directory inside the try keeps a failure here from
	        # crashing the caller, like every other step of the sync.
	        os.makedirs(data_dir, exist_ok=True)
	        local_files = {
	            entry
	            for entry in os.listdir(data_dir)
	            if os.path.isfile(os.path.join(data_dir, entry))
	        }
	    except Exception as e:
	        print(f"Error accessing local files: {e}")
	        return

	    try:
	        remote_files_info = list_all_files(bucket_name)
	    except Exception as e:
	        print(f"Error fetching files from Supabase: {e}")
	        return

	    remote_files = {info["name"] for info in remote_files_info}

	    # Synchronisation: remove local files that no longer exist in storage.
	    # NOTE(review): list_all_files prints and swallows listing errors, so a
	    # failed or partial listing looks like "missing remotely" here and
	    # triggers deletions - confirm this is acceptable.
	    for filename in local_files - remote_files:
	        try:
	            os.remove(os.path.join(data_dir, filename))
	            print(f"Removed: {filename}")
	        except Exception as e:
	            print(f"Error removing {filename}: {e}")

	    for filename in remote_files - local_files:
	        file_path = os.path.join(data_dir, filename)
	        # Write to a temporary name first and rename on success. Because
	        # the sync compares names only, a truncated file left at the final
	        # path would be treated as already downloaded on the next run.
	        tmp_path = file_path + ".part"
	        try:
	            res = supabase.storage.from_(bucket_name).download(filename)
	            with open(tmp_path, "wb") as f:
	                f.write(res)
	            os.replace(tmp_path, file_path)
	            print(f"Downloaded: {filename}")
	        except Exception as e:
	            print(f"Error downloading {filename}: {e}")
	            try:
	                if os.path.exists(tmp_path):
	                    os.remove(tmp_path)
	            except OSError as cleanup_error:
	                print(f"Error removing {tmp_path}: {cleanup_error}")


	def load_docs():
	    """Load every supported document from the local ``data`` directory.

	    The ``data`` directory lives next to this module. When it does not
	    exist it is created and an empty list is returned. Files are visited
	    in sorted name order and chosen by extension, ignoring case:

	    * ``.pdf`` -> ``PyPDFLoader``
	    * ``.docx`` / ``.doc`` -> ``Docx2txtLoader``
	    * ``.txt`` -> ``TextLoader`` (UTF-8 with encoding auto-detection)

	    Files with any other extension are ignored. A file that fails to load
	    is reported with ``print`` and skipped.

	    Returns:
	        A list with the documents produced by the loaders; empty when the
	        directory is missing, unreadable, or has no loadable files.
	    """
	    base_dir = os.path.dirname(os.path.abspath(__file__))
	    data_dir = os.path.join(base_dir, 'data')

	    if not os.path.exists(data_dir):
	        print(f"Directory not found: {data_dir}")
	        os.makedirs(data_dir)
	        print(f"Created directory: {data_dir}")
	        return []

	    try:
	        # os.listdir order is arbitrary; sorting keeps the resulting
	        # document order stable between runs.
	        files = sorted(os.listdir(data_dir))
	    except PermissionError:
	        print(f"Permission denied: {data_dir}")
	        return []

	    # (suffixes, label used in the error message, loader factory).
	    # The factories are lambdas so the loader is only constructed inside
	    # the try block below, where a failure is reported per file.
	    loaders = (
	        ((".pdf",), "PDF", lambda path: PyPDFLoader(path)),
	        ((".docx", ".doc"), "DOCX/DOC", lambda path: Docx2txtLoader(path)),
	        (
	            (".txt",),
	            "TXT",
	            lambda path: TextLoader(
	                path, encoding='utf-8', autodetect_encoding=True
	            ),
	        ),
	    )

	    documents = []
	    for file in files:
	        file_path = os.path.join(data_dir, file)
	        # Compare on the lower-cased name so "REPORT.PDF" is not skipped.
	        lowered = file.lower()
	        for suffixes, label, make_loader in loaders:
	            if not lowered.endswith(suffixes):
	                continue
	            try:
	                documents.extend(make_loader(file_path).load())
	            except Exception as e:
	                print(f"Error loading {label} file {file}: {e}")
	            break

	    return documents