Spaces:
Sleeping
Sleeping
""" | |
Manage documents tab functionality for the Gradio app | |
""" | |
import gradio as gr | |
def _group_chunks_by_file(metadatas, ids, documents):
    """Group stored chunks into one record per file, keyed by file id."""
    doc_map = {}
    for meta, doc_id, doc_text in zip(metadatas, ids, documents):
        # The store may hand back None for a chunk saved without metadata.
        meta = meta or {}
        # Chunks lacking a file_id are treated as their own single-chunk file.
        file_id = meta.get("file_id", doc_id)
        if file_id not in doc_map:
            doc_map[file_id] = {
                "source": meta.get("source", "Unknown"),
                "university": meta.get("university", "Unknown"),
                "country": meta.get("country", "Unknown"),
                "document_type": meta.get("document_type", "Unknown"),
                "language": meta.get("language", "Unknown"),
                "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                "file_id": file_id,
                "chunks": [],
            }
        doc_map[file_id]["chunks"].append(doc_text)
    return doc_map


def _format_document_entry(index, file_id, info):
    """Render the Markdown details section for a single grouped document."""
    # Coerce to str so a non-string timestamp cannot break slicing; keep at
    # most the first 19 characters.
    timestamp = str(info["upload_timestamp"])[:19]
    lines = [
        "",
        f"**{index}. {info['source']}**",
        f"- University: {info['university']}",
        f"- Country: {info['country']}",
        f"- Type: {info['document_type']}",
        f"- Language: {info['language']}",
        f"- Chunks: {len(info['chunks'])}",
        f"- Uploaded: {timestamp}",
        f"- File ID: `{file_id}`",
        "---",
        "",
    ]
    return "\n".join(lines)


def manage_documents(global_vars):
    """Summarize every document stored in the vectorstore.

    Chunks are grouped by their ``file_id`` metadata (falling back to the
    chunk's own id when that key is absent) so each file is listed once.

    Args:
        global_vars: Dict of shared application state. Its ``'doc_ingestion'``
            entry must provide ``load_existing_vectorstore()``.

    Returns:
        A ``(summary, document_list, file_options)`` tuple. ``summary`` and
        ``document_list`` are Markdown strings; ``file_options`` is a list of
        ``"source (university)"`` labels. When nothing can be listed (not
        initialized, empty store, or an error), ``summary`` carries the status
        message, ``document_list`` is ``""`` and ``file_options`` is ``[]``.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "β Please initialize systems first!", "", []

    # UI boundary: any failure is reported as a status string instead of
    # propagating into the Gradio event handler.
    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "β οΈ No documents found. Upload documents first.", "", []

        all_docs = vectorstore._collection.get(include=["metadatas", "documents"])
        doc_map = _group_chunks_by_file(
            all_docs["metadatas"], all_docs["ids"], all_docs["documents"]
        )
        if not doc_map:
            return "βΉοΈ No documents found in the system.", "", []

        total_documents = len(doc_map)
        total_chunks = sum(len(info["chunks"]) for info in doc_map.values())
        summary = "\n".join([
            "## π Document Statistics",
            f"**Total Documents:** {total_documents}",
            f"**Total Text Chunks:** {total_chunks}",
            "**Storage Status:** Active",
            "## π Document List",
            "",
        ])

        document_list = "".join(
            _format_document_entry(i, file_id, info)
            for i, (file_id, info) in enumerate(doc_map.items(), 1)
        )

        # Labels offered in the single-document deletion dropdown.
        file_options = [
            f"{info['source']} ({info['university']})" for info in doc_map.values()
        ]
        return summary, document_list, file_options
    except Exception as e:
        return f"β Error loading documents: {str(e)}", "", []
def delete_document(selected_file, current_doc_list, global_vars):
    """Delete every chunk belonging to the document chosen in the dropdown.

    The dropdown label has the form ``"source (university)"``. The first
    stored chunk whose metadata produces that label determines the target
    file id, and all chunks sharing that file id are removed.

    Args:
        selected_file: Dropdown label of the document to delete.
        current_doc_list: Document-list Markdown currently shown; returned
            unchanged whenever nothing is deleted.
        global_vars: Dict of shared application state. Its ``'doc_ingestion'``
            entry must provide ``load_existing_vectorstore()``.

    Returns:
        A ``(status_message, document_list_markdown)`` tuple.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    # Report the two precondition failures separately so an uninitialized
    # system is not described as a missing selection.
    if not doc_ingestion:
        return "β Please initialize systems first.", current_doc_list
    if not selected_file:
        return "β Please select a document to delete.", current_doc_list

    # UI boundary: failures become a status string for the Gradio handler.
    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "β No vectorstore found.", current_doc_list

        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])

        # One (chunk id, file id, dropdown label) triple per stored chunk.
        entries = []
        for meta, doc_id in zip(all_docs["metadatas"], all_docs["ids"]):
            # The store may hand back None for a chunk saved without metadata.
            meta = meta or {}
            label = f"{meta.get('source', 'Unknown')} ({meta.get('university', 'Unknown')})"
            entries.append((doc_id, meta.get("file_id", doc_id), label))

        target_file_id = next(
            (file_id for _, file_id, label in entries if label == selected_file),
            None,
        )
        if target_file_id is None:
            return "β Document not found.", current_doc_list

        ids_to_delete = [
            doc_id for doc_id, file_id, _ in entries if file_id == target_file_id
        ]
        collection.delete(ids=ids_to_delete)

        # Re-render the list so the caller can show the post-deletion state.
        _, new_doc_list, _ = manage_documents(global_vars)
        return f"β Successfully deleted document: {selected_file}", new_doc_list
    except Exception as e:
        return f"β Error deleting document: {str(e)}", current_doc_list
def delete_all_documents(global_vars):
    """Remove every stored chunk from the vectorstore collection.

    Args:
        global_vars: Dict of shared application state. Its ``'doc_ingestion'``
            entry must provide ``load_existing_vectorstore()``. After a
            successful deletion its ``'vectorstore'`` entry is set to ``None``.

    Returns:
        A ``(status_message, "")`` tuple; the second element is always the
        empty string.
    """
    ingestion = global_vars.get('doc_ingestion')
    if not ingestion:
        return "β Please initialize systems first.", ""

    try:
        store = ingestion.load_existing_vectorstore()
        if not store:
            return "β οΈ No documents found to delete.", ""

        chunk_collection = store._collection
        chunk_ids = chunk_collection.get()["ids"]
        if not chunk_ids:
            return "βΉοΈ No documents found to delete.", ""

        chunk_collection.delete(ids=chunk_ids)
        # Drop the cached handle so it is not reused after the wipe.
        global_vars['vectorstore'] = None
        return f"β Successfully deleted all {len(chunk_ids)} document chunks.", ""
    except Exception as exc:
        return f"β Error deleting all documents: {str(exc)}", ""
def create_manage_tab(global_vars):
    """Build the "Manage Documents" tab and wire up its event handlers.

    Must be called inside an active Gradio Blocks/Tabs context. The tab shows
    a document summary and list, a button to refresh them, a dropdown plus
    button to delete one document, and a button to delete everything.

    Args:
        global_vars: Dict of shared application state, passed through to
            ``manage_documents``, ``delete_document`` and
            ``delete_all_documents``.
    """
    with gr.Tab("π Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents
        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("π Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("ποΈ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="π Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )
        doc_list = gr.Markdown(
            value="π Document details will appear here after refresh.",
            label="Document List"
        )

        # Individual document deletion
        gr.Markdown("### ποΈ Delete Individual Document")
        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("ποΈ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        def refresh_documents():
            """Reload the summary, the document list and the dropdown choices."""
            summary, documents, file_options = manage_documents(global_vars)
            # Normalize to a list: the options value may be empty or falsy
            # when nothing could be listed.
            return summary, documents, gr.Dropdown(choices=list(file_options or []), value=None)

        def delete_selected_document(selected_file, current_list):
            """Delete the chosen document, then refresh the list and dropdown."""
            if not selected_file:
                # Leave the dropdown untouched; replacing its choices with []
                # would discard the options the user still needs to pick from.
                return "β Please select a document to delete first.", current_list, gr.update()

            status, new_list = delete_document(selected_file, current_list, global_vars)
            # Re-query so the dropdown no longer offers the deleted document.
            _, _, new_options = manage_documents(global_vars)
            return status, new_list, gr.Dropdown(choices=list(new_options or []), value=None)

        def delete_all_docs():
            """Delete everything, then show what the store actually contains."""
            status, _ = delete_all_documents(global_vars)
            # Re-read the store instead of assuming success: if the deletion
            # failed, the surviving documents stay listed and selectable.
            _, documents, remaining_options = manage_documents(global_vars)
            return (
                status,
                documents or "π No documents in the system.",
                gr.Dropdown(choices=list(remaining_options or []), value=None),
            )

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )
        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_list, file_selector]
        )
        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_list, file_selector]
        )