|
""" |
|
Manage documents tab functionality for the Gradio app |
|
""" |
|
import gradio as gr |
|
|
|
def manage_documents(global_vars):
    """Build a Markdown overview of every document held in the vectorstore.

    Chunks returned by the collection are grouped by their ``file_id``
    metadata (falling back to the chunk's own id when that key is absent),
    so each uploaded file is listed once together with its chunk count.

    Args:
        global_vars: Mapping of shared application state. Its
            ``'doc_ingestion'`` entry must provide
            ``load_existing_vectorstore()``.

    Returns:
        A ``(summary, document_list, file_options)`` tuple. ``summary`` and
        ``document_list`` are Markdown strings; ``file_options`` is a list
        of ``"<source> (<university>)"`` labels, one per document. When
        nothing can be listed (systems not initialised, no vectorstore, no
        chunks, or an exception) ``summary`` carries the status message,
        ``document_list`` is ``""`` and ``file_options`` is ``[]``.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "⚠️ No documents found. Upload documents first.", "", []

        # NOTE: reaches into the private `_collection` attribute of the
        # vectorstore wrapper to read the raw chunks.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas", "documents"])

        doc_map = {}
        for meta, doc_id, doc_text in zip(
            all_docs["metadatas"], all_docs["ids"], all_docs["documents"]
        ):
            # A chunk stored without metadata comes back as None; treat it
            # as an empty mapping so it is still listed (under its own id).
            meta = meta or {}
            file_id = meta.get("file_id", doc_id)
            if file_id not in doc_map:
                # The first chunk seen for a file supplies its display fields.
                doc_map[file_id] = {
                    "source": meta.get("source", "Unknown"),
                    "university": meta.get("university", "Unknown"),
                    "country": meta.get("country", "Unknown"),
                    "document_type": meta.get("document_type", "Unknown"),
                    "language": meta.get("language", "Unknown"),
                    "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                    "file_id": file_id,
                    "chunks": [],
                }
            doc_map[file_id]["chunks"].append(doc_text)

        if not doc_map:
            return "ℹ️ No documents found in the system.", "", []

        total_documents = len(doc_map)
        total_chunks = sum(len(info["chunks"]) for info in doc_map.values())

        summary = (
            "## 📊 Document Statistics\n"
            "\n"
            f"**Total Documents:** {total_documents}\n"
            f"**Total Text Chunks:** {total_chunks}\n"
            "**Storage Status:** Active\n"
            "\n"
            "## 📋 Document List\n"
        )

        entries = []
        for i, (file_id, info) in enumerate(doc_map.items(), 1):
            # Keep at most the first 19 characters of the timestamp; str()
            # guards against a non-string value stored in the metadata.
            timestamp = str(info["upload_timestamp"])[:19]
            entries.append(
                "\n"
                f"**{i}. {info['source']}**\n"
                f"- University: {info['university']}\n"
                f"- Country: {info['country']}\n"
                f"- Type: {info['document_type']}\n"
                f"- Language: {info['language']}\n"
                f"- Chunks: {len(info['chunks'])}\n"
                f"- Uploaded: {timestamp}\n"
                f"- File ID: `{file_id}`\n"
                "\n"
                "---\n"
            )
        document_list = "".join(entries)

        # Labels shown in the delete dropdown. They are not guaranteed to be
        # unique: two files sharing source and university get the same label.
        file_options = [
            f"{info['source']} ({info['university']})" for info in doc_map.values()
        ]

        return summary, document_list, file_options

    except Exception as e:
        # UI boundary: report the failure as a status message, never raise.
        return f"❌ Error loading documents: {e}", "", []
|
|
|
def delete_document(selected_file, current_doc_list, global_vars):
    """Delete every chunk of the document chosen in the dropdown.

    The dropdown label has the form ``"<source> (<university>)"``. The first
    chunk whose metadata produces that label determines the ``file_id`` to
    remove; all chunks sharing that ``file_id`` are then deleted.

    Args:
        selected_file: Dropdown label of the document to delete.
        current_doc_list: Markdown currently shown in the document list;
            returned unchanged whenever nothing is deleted.
        global_vars: Mapping of shared application state. Its
            ``'doc_ingestion'`` entry must provide
            ``load_existing_vectorstore()``.

    Returns:
        A ``(status_message, document_list_markdown)`` tuple.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion or not selected_file:
        return "❌ Please select a document to delete.", current_doc_list

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "❌ No vectorstore found.", current_doc_list

        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])

        # Single pass: remember which file each chunk belongs to and pick the
        # file_id of the first chunk whose label matches the selection.
        # NOTE: labels are not guaranteed unique; when two files share source
        # and university, the first one encountered is the one deleted.
        chunk_owners = []
        target_file_id = None
        for meta, doc_id in zip(all_docs["metadatas"], all_docs["ids"]):
            meta = meta or {}  # chunks stored without metadata come back as None
            file_id = meta.get("file_id", doc_id)
            chunk_owners.append((doc_id, file_id))
            if target_file_id is None:
                source = meta.get("source", "Unknown")
                university = meta.get("university", "Unknown")
                if f"{source} ({university})" == selected_file:
                    target_file_id = file_id

        if target_file_id is None:
            return "❌ Document not found.", current_doc_list

        ids_to_delete = [
            doc_id for doc_id, file_id in chunk_owners if file_id == target_file_id
        ]
        collection.delete(ids=ids_to_delete)

        # Re-render the list so the caller can show the post-deletion state.
        _, new_doc_list, _ = manage_documents(global_vars)

        return f"✅ Successfully deleted document: {selected_file}", new_doc_list

    except Exception as e:
        # UI boundary: report the failure as a status message, never raise.
        return f"❌ Error deleting document: {e}", current_doc_list
|
|
|
def delete_all_documents(global_vars):
    """Remove every chunk from the vectorstore collection.

    Args:
        global_vars: Mutable mapping of shared application state. Its
            ``'doc_ingestion'`` entry must provide
            ``load_existing_vectorstore()``. After a successful deletion its
            ``'vectorstore'`` entry is reset to ``None``.

    Returns:
        A ``(status_message, "")`` tuple; the second element is always the
        empty string.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first.", ""

    try:
        vectorstore_instance = doc_ingestion.load_existing_vectorstore()
        if not vectorstore_instance:
            return "⚠️ No documents found to delete.", ""

        collection = vectorstore_instance._collection
        # Only the ids are needed, so skip loading documents and metadatas.
        all_ids = collection.get(include=[])["ids"]
        if not all_ids:
            return "ℹ️ No documents found to delete.", ""

        collection.delete(ids=all_ids)
        # Clear the shared vectorstore entry now that the collection is empty.
        global_vars['vectorstore'] = None
        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""

    except Exception as e:
        # UI boundary: report the failure as a status message, never raise.
        return f"❌ Error deleting all documents: {e}", ""
|
|
|
def create_manage_tab(global_vars):
    """Create the "Manage Documents" tab and wire up its event handlers.

    Must be called inside an active ``gr.Blocks``/``gr.Tabs`` context. The
    tab offers a refresh button, a delete-all button, a Markdown summary and
    document list, and a dropdown plus button for deleting a single document.

    Args:
        global_vars: Mapping of shared application state handed through to
            ``manage_documents``, ``delete_document`` and
            ``delete_all_documents``.
    """
    with gr.Tab("📁 Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents
        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete All Documents", variant="stop")

        doc_summary = gr.Markdown(
            value="📋 Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )

        doc_list = gr.Markdown(
            value="📋 Document details will appear here after refresh.",
            label="Document List"
        )

        gr.Markdown("### 🗑️ Delete Individual Document")

        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("🗑️ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        def refresh_documents():
            """Reload summary, document list and dropdown choices."""
            summary, documents, file_options = manage_documents(global_vars)
            return summary, documents, gr.Dropdown(choices=file_options, value=None)

        def delete_selected_document(selected_file, current_list):
            """Delete the chosen document, then refresh list and dropdown."""
            if not selected_file:
                # Leave the dropdown untouched: replacing it with an empty
                # choice list would wipe the options loaded by the last
                # refresh just because nothing was selected.
                return (
                    "❌ Please select a document to delete first.",
                    current_list,
                    gr.update(),
                )

            status, new_list = delete_document(selected_file, current_list, global_vars)

            # Re-query so the dropdown no longer offers the deleted document.
            _, _, new_options = manage_documents(global_vars)
            return status, new_list, gr.Dropdown(choices=new_options, value=None)

        def delete_all_docs():
            """Delete everything and reset list and dropdown to empty."""
            status, _ = delete_all_documents(global_vars)
            return status, "📋 No documents in the system.", gr.Dropdown(choices=[], value=None)

        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )

        # NOTE(review): neither delete handler lists doc_summary among its
        # outputs, so the statistics stay as they were until the next refresh.
        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_list, file_selector]
        )

        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_list, file_selector]
        )
|
|