TopEdu / tabs /manage.py
Ervinoreo
gradio
846f122
"""
Manage documents tab functionality for the Gradio app
"""
import gradio as gr
def manage_documents(global_vars):
    """Summarise the documents currently stored in the vectorstore.

    Chunks returned by the collection are grouped by the ``file_id`` found in
    their metadata (falling back to the chunk's own id when the key is
    missing), so every uploaded file is listed once with its chunk count.

    Args:
        global_vars: Mapping of shared application state. Only the
            ``'doc_ingestion'`` entry is read; it must provide
            ``load_existing_vectorstore()``.

    Returns:
        A ``(summary, document_list, file_options)`` tuple:

        * ``summary`` - Markdown statistics header, or a status message when
          nothing can be listed.
        * ``document_list`` - Markdown listing of the documents (``""`` when
          nothing can be listed).
        * ``file_options`` - list of ``"<source> (<university>)"`` labels,
          one per document. Always a list, including on every failure path.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "⚠️ No documents found. Upload documents first.", "", []

        # Only ids and metadata are needed: chunks are counted, never shown,
        # so the chunk texts are not fetched.
        # NOTE(review): `_collection` is a private attribute of the
        # vectorstore wrapper (presumably a Chroma collection) - confirm.
        all_docs = vectorstore._collection.get(include=["metadatas"])
        ids = all_docs["ids"]
        metadatas = all_docs["metadatas"] or [None] * len(ids)

        fields = (
            "source",
            "university",
            "country",
            "document_type",
            "language",
            "upload_timestamp",
        )

        # Group chunks by file_id; dict insertion order keeps the listing in
        # first-seen order.
        doc_map = {}
        for meta, doc_id in zip(metadatas, ids):
            meta = meta or {}  # a chunk may carry no metadata at all
            file_id = meta.get("file_id", doc_id)
            info = doc_map.get(file_id)
            if info is None:
                info = {field: meta.get(field, "Unknown") for field in fields}
                info["file_id"] = file_id
                info["chunk_count"] = 0
                doc_map[file_id] = info
            info["chunk_count"] += 1

        if not doc_map:
            return "ℹ️ No documents found in the system.", "", []

        total_documents = len(doc_map)
        total_chunks = sum(info["chunk_count"] for info in doc_map.values())
        summary = (
            "## 📊 Document Statistics\n"
            f"**Total Documents:** {total_documents}\n"
            f"**Total Text Chunks:** {total_chunks}\n"
            "**Storage Status:** Active\n"
            "## 📚 Document List\n"
        )

        entries = []
        file_options = []
        for i, (file_id, info) in enumerate(doc_map.items(), 1):
            # Keep at most 19 characters of the timestamp (the length of
            # "YYYY-MM-DDTHH:MM:SS"); str() guards against non-string values.
            timestamp = str(info["upload_timestamp"])[:19]
            entries.append(
                f"\n**{i}. {info['source']}**\n"
                f"- University: {info['university']}\n"
                f"- Country: {info['country']}\n"
                f"- Type: {info['document_type']}\n"
                f"- Language: {info['language']}\n"
                f"- Chunks: {info['chunk_count']}\n"
                f"- Uploaded: {timestamp}\n"
                f"- File ID: `{file_id}`\n"
                "---\n"
            )
            # Label shown in the deletion dropdown.
            file_options.append(f"{info['source']} ({info['university']})")

        return summary, "".join(entries), file_options
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Error loading documents: {str(e)}", "", []
def delete_document(selected_file, current_doc_list, global_vars):
    """Delete every chunk of the document selected in the dropdown.

    The dropdown label ``"<source> (<university>)"`` is matched against the
    metadata of the stored chunks; the first matching chunk determines the
    ``file_id`` (the chunk's own id when the key is missing), and all chunks
    sharing that ``file_id`` are removed.

    Args:
        selected_file: Dropdown label of the document to delete.
        current_doc_list: Markdown listing currently displayed; returned
            unchanged whenever nothing is deleted.
        global_vars: Mapping of shared application state. Only the
            ``'doc_ingestion'`` entry is read.

    Returns:
        A ``(status_message, document_list)`` tuple. On success the listing
        is regenerated with ``manage_documents``; otherwise
        ``current_doc_list`` is passed through.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion or not selected_file:
        return "❌ Please select a document to delete.", current_doc_list

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "❌ No vectorstore found.", current_doc_list

        # NOTE(review): `_collection` is a private attribute of the
        # vectorstore wrapper (presumably a Chroma collection) - confirm.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])
        ids = all_docs["ids"]
        metadatas = all_docs["metadatas"] or [None] * len(ids)

        # Single pass: remember each chunk's file_id and pick the file_id of
        # the first chunk whose label matches the selection.
        chunk_file_ids = []
        target_file_id = None
        for meta, doc_id in zip(metadatas, ids):
            meta = meta or {}  # a chunk may carry no metadata at all
            file_id = meta.get("file_id", doc_id)
            chunk_file_ids.append(file_id)
            if target_file_id is None:
                source = meta.get("source", "Unknown")
                university = meta.get("university", "Unknown")
                if f"{source} ({university})" == selected_file:
                    target_file_id = file_id

        # Compare against None so falsy-but-valid ids ("" or 0) still match.
        if target_file_id is None:
            return "❌ Document not found.", current_doc_list

        ids_to_delete = [
            doc_id
            for doc_id, file_id in zip(ids, chunk_file_ids)
            if file_id == target_file_id
        ]
        collection.delete(ids=ids_to_delete)

        # Regenerate the listing so the UI reflects the deletion.
        _, new_doc_list, _ = manage_documents(global_vars)
        return f"✅ Successfully deleted document: {selected_file}", new_doc_list
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Error deleting document: {str(e)}", current_doc_list
def delete_all_documents(global_vars):
    """Delete every chunk stored in the vectorstore.

    Args:
        global_vars: Mapping of shared application state. The
            ``'doc_ingestion'`` entry is read, and ``'vectorstore'`` is reset
            to ``None`` after a successful deletion.

    Returns:
        A ``(status_message, document_list)`` tuple; ``document_list`` is
        always ``""``.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first.", ""

    try:
        vectorstore_instance = doc_ingestion.load_existing_vectorstore()
        if not vectorstore_instance:
            return "⚠️ No documents found to delete.", ""

        # Only the ids are needed, so ask the collection not to load the
        # chunk texts or metadata.
        # NOTE(review): `_collection` is a private attribute of the
        # vectorstore wrapper (presumably a Chroma collection) - confirm.
        collection = vectorstore_instance._collection
        all_ids = collection.get(include=[])["ids"]

        if not all_ids:
            return "ℹ️ No documents found to delete.", ""

        collection.delete(ids=all_ids)
        # Drop the shared vectorstore handle now that the store is empty.
        global_vars['vectorstore'] = None
        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Error deleting all documents: {str(e)}", ""
def create_manage_tab(global_vars):
    """Create the "Manage Documents" tab and wire up its event handlers.

    The tab shows a statistics summary and a per-document listing, and offers
    buttons to refresh the view, delete one selected document, or delete all
    documents. Every handler re-reads the store after acting, so the summary,
    listing and dropdown stay consistent with what is actually stored.

    Args:
        global_vars: Mapping of shared application state, forwarded to
            ``manage_documents``, ``delete_document`` and
            ``delete_all_documents``.
    """
    # NOTE(review): presumably called while the app's gr.Blocks / gr.Tabs
    # layout context is open - confirm against the caller.
    with gr.Tab("🗂 Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents
        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="📊 Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )
        doc_list = gr.Markdown(
            value="📚 Document details will appear here after refresh.",
            label="Document List"
        )

        # Individual document deletion
        gr.Markdown("### 🗑️ Delete Individual Document")
        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("🗑️ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        def _current_view():
            """Return (summary, listing, dropdown choices) for the store as it is now."""
            summary, documents, options = manage_documents(global_vars)
            # Normalise to a list so the dropdown always receives valid choices.
            return summary, documents, list(options or [])

        # Event handlers
        def refresh_documents():
            summary, documents, options = _current_view()
            return summary, documents, gr.Dropdown(choices=options, value=None)

        def delete_selected_document(selected_file, current_summary, current_list):
            if not selected_file:
                # Nothing selected: leave the displayed summary and listing
                # alone and keep the dropdown populated instead of blanking it.
                _, _, options = _current_view()
                return (
                    "❌ Please select a document to delete first.",
                    current_summary,
                    current_list,
                    gr.Dropdown(choices=options, value=None),
                )
            status, new_list = delete_document(selected_file, current_list, global_vars)
            # Re-read the store so the statistics and choices match the result.
            summary, _, options = _current_view()
            return status, summary, new_list, gr.Dropdown(choices=options, value=None)

        def delete_all_docs():
            status, _ = delete_all_documents(global_vars)
            # Show what is really left: if deletion failed, the remaining
            # documents stay visible rather than being reported as gone.
            summary, documents, options = _current_view()
            return (
                status,
                summary,
                documents or "📚 No documents in the system.",
                gr.Dropdown(choices=options, value=None),
            )

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )
        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_summary, doc_list],
            outputs=[delete_status, doc_summary, doc_list, file_selector]
        )
        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_summary, doc_list, file_selector]
        )