Spaces:
Sleeping
Sleeping
File size: 9,029 Bytes
102c695 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
"""
Manage documents tab functionality for the Gradio app
"""
import gradio as gr
def manage_documents(global_vars):
    """Summarise every document held in the vector store.

    Chunks are grouped by their ``file_id`` metadata (falling back to the
    chunk's own id when that key is absent) so each uploaded file is listed
    once, together with the number of chunks it was split into.

    Args:
        global_vars: Mapping of shared application state. Only the
            ``'doc_ingestion'`` entry is read.

    Returns:
        A ``(summary, document_list, file_options)`` tuple. ``summary`` and
        ``document_list`` are Markdown strings; ``file_options`` is a list
        of ``"source (university)"`` labels. On every non-success path
        ``summary`` carries the status message, ``document_list`` is ``""``
        and ``file_options`` is an empty list.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "β Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "β οΈ No documents found. Upload documents first.", "", []

        # NOTE: this reaches into the vector store's private ``_collection``.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas", "documents"])

        # Group chunks by file_id so each uploaded file appears once.
        doc_map = {}
        rows = zip(all_docs["metadatas"], all_docs["ids"], all_docs["documents"])
        for meta, doc_id, doc_text in rows:
            # Guard against a metadata entry that is None instead of a dict.
            meta = meta or {}
            file_id = meta.get("file_id", doc_id)
            entry = doc_map.get(file_id)
            if entry is None:
                entry = doc_map[file_id] = {
                    "source": meta.get("source", "Unknown"),
                    "university": meta.get("university", "Unknown"),
                    "country": meta.get("country", "Unknown"),
                    "document_type": meta.get("document_type", "Unknown"),
                    "language": meta.get("language", "Unknown"),
                    "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                    "file_id": file_id,
                    "chunks": [],
                }
            entry["chunks"].append(doc_text)

        if not doc_map:
            return "βΉοΈ No documents found in the system.", "", []

        total_documents = len(doc_map)
        total_chunks = sum(len(info["chunks"]) for info in doc_map.values())
        summary = "\n".join([
            "## π Document Statistics",
            f"**Total Documents:** {total_documents}",
            f"**Total Text Chunks:** {total_chunks}",
            "**Storage Status:** Active",
            "## π Document List",
            "",
        ])

        # One Markdown entry per file, joined once at the end.
        entries = []
        for index, (file_id, info) in enumerate(doc_map.items(), 1):
            # Keep at most the first 19 characters of the timestamp; str()
            # makes this safe for non-string timestamp values as well.
            timestamp = str(info["upload_timestamp"])[:19]
            entries.append("\n".join([
                "",
                f"**{index}. {info['source']}**",
                f"- University: {info['university']}",
                f"- Country: {info['country']}",
                f"- Type: {info['document_type']}",
                f"- Language: {info['language']}",
                f"- Chunks: {len(info['chunks'])}",
                f"- Uploaded: {timestamp}",
                f"- File ID: `{file_id}`",
                "---",
                "",
            ]))
        document_list = "".join(entries)

        # Labels offered by the "delete individual document" dropdown.
        file_options = [
            f"{info['source']} ({info['university']})"
            for info in doc_map.values()
        ]
        return summary, document_list, file_options
    except Exception as e:
        # UI boundary: report the failure as a status message.
        return f"β Error loading documents: {e}", "", []
def delete_document(selected_file, current_doc_list, global_vars):
    """Delete every chunk belonging to one document.

    The document is identified by its dropdown label,
    ``"source (university)"``. The first chunk whose metadata produces that
    label determines the target ``file_id``; all chunks sharing that
    ``file_id`` are then removed. If two different files produce the same
    label, only the first match is deleted.

    Args:
        selected_file: Dropdown label of the document to delete.
        current_doc_list: Markdown listing currently shown; returned
            unchanged whenever nothing is deleted.
        global_vars: Mapping of shared application state. Only the
            ``'doc_ingestion'`` entry is read.

    Returns:
        A ``(status_message, document_list)`` tuple. After a successful
        deletion ``document_list`` is regenerated via ``manage_documents``.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        # Report the real problem rather than "select a document".
        return "β Please initialize systems first.", current_doc_list
    if not selected_file:
        return "β Please select a document to delete.", current_doc_list

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "β No vectorstore found.", current_doc_list

        # NOTE: this reaches into the vector store's private ``_collection``.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])
        # Normalise None metadata entries to empty dicts so .get() is safe.
        chunks = [
            (meta or {}, doc_id)
            for meta, doc_id in zip(all_docs["metadatas"], all_docs["ids"])
        ]

        # Resolve the label back to a file_id (first match wins).
        target_file_id = None
        for meta, doc_id in chunks:
            label = f"{meta.get('source', 'Unknown')} ({meta.get('university', 'Unknown')})"
            if label == selected_file:
                target_file_id = meta.get("file_id", doc_id)
                break

        # Compare against None so a falsy id (0, "") still counts as found.
        if target_file_id is None:
            return "β Document not found.", current_doc_list

        ids_to_delete = [
            doc_id
            for meta, doc_id in chunks
            if meta.get("file_id", doc_id) == target_file_id
        ]
        collection.delete(ids=ids_to_delete)

        # Rebuild the listing so the caller can show the post-delete state.
        _, new_doc_list, _ = manage_documents(global_vars)
        return f"✅ Successfully deleted document: {selected_file}", new_doc_list
    except Exception as e:
        # UI boundary: report the failure as a status message.
        return f"β Error deleting document: {e}", current_doc_list
def delete_all_documents(global_vars):
    """Delete every chunk from the vector store.

    Args:
        global_vars: Mapping of shared application state. The
            ``'doc_ingestion'`` entry is read, and after a successful
            deletion the ``'vectorstore'`` entry is set to ``None``.

    Returns:
        A ``(status_message, document_list)`` tuple; ``document_list`` is
        always the empty string.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "β Please initialize systems first.", ""

    try:
        vectorstore_instance = doc_ingestion.load_existing_vectorstore()
        if not vectorstore_instance:
            return "β οΈ No documents found to delete.", ""

        # NOTE: this reaches into the vector store's private ``_collection``.
        collection = vectorstore_instance._collection
        # Only the ids are needed, so skip loading documents and metadatas.
        all_ids = collection.get(include=[])["ids"]
        if not all_ids:
            return "βΉοΈ No documents found to delete.", ""

        collection.delete(ids=all_ids)
        # Drop the shared vectorstore reference held in the app state.
        global_vars['vectorstore'] = None
        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""
    except Exception as e:
        # UI boundary: report the failure as a status message.
        return f"β Error deleting all documents: {e}", ""
def create_manage_tab(global_vars):
    """Build the "Manage Documents" tab and wire up its event handlers.

    Must be called inside an active Gradio layout context. The tab offers a
    refresh button, a delete-all button, and a dropdown plus button for
    deleting a single document.

    Args:
        global_vars: Mapping of shared application state, passed through to
            ``manage_documents``, ``delete_document`` and
            ``delete_all_documents``.
    """
    with gr.Tab("π Manage Documents", id="manage"):
        gr.Markdown(
            "\n"
            "### Step 4: Manage Your Documents\n"
            "View, inspect, and manage all uploaded documents in your knowledge base.\n"
            "You can see document details and delete individual documents or all documents.\n"
        )

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("π Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("ποΈ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="π Click 'Refresh Document List' to view your documents.",
            label="Document Summary",
        )
        doc_list = gr.Markdown(
            value="π Document details will appear here after refresh.",
            label="Document List",
        )

        # Individual document deletion
        gr.Markdown("### ποΈ Delete Individual Document")
        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents",
            )
            delete_single_btn = gr.Button("ποΈ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here...",
        )

        def dropdown_update(options):
            """Return a dropdown update with the given choices and no selection."""
            return gr.Dropdown(choices=options or [], value=None)

        def refresh_documents():
            """Reload the summary, the listing and the dropdown choices."""
            summary, documents, file_options = manage_documents(global_vars)
            return summary, documents, dropdown_update(file_options)

        def delete_selected_document(selected_file, current_list):
            """Delete the selected document and refresh listing and choices."""
            if not selected_file:
                # Re-read the available documents rather than clearing the
                # dropdown, so a missed selection does not empty the choices.
                _, _, options = manage_documents(global_vars)
                return (
                    "β Please select a document to delete first.",
                    current_list,
                    dropdown_update(options),
                )
            status, new_list = delete_document(selected_file, current_list, global_vars)
            # Refresh the choices so the deleted document disappears.
            _, _, new_options = manage_documents(global_vars)
            return status, new_list, dropdown_update(new_options)

        def delete_all_docs():
            """Delete everything, then show what is actually left."""
            status, _ = delete_all_documents(global_vars)
            _, remaining_list, remaining_options = manage_documents(global_vars)
            if remaining_options:
                # Deletion failed or was incomplete: keep showing the real
                # contents instead of claiming the system is empty.
                return status, remaining_list, dropdown_update(remaining_options)
            return status, "π No documents in the system.", dropdown_update([])

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector],
        )
        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_list, file_selector],
        )
        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_list, file_selector],
        )
|