"""
Manage documents tab functionality for the Gradio app
"""

import gradio as gr


def _document_label(source, university):
    """Return the dropdown label that identifies a document to the user."""
    return f"{source} ({university})"


def _group_by_file(metadatas, ids):
    """Collapse per-chunk metadata into one summary entry per file.

    Chunks are grouped by their ``file_id`` metadata value; a chunk without
    one falls back to its own chunk id, so it is listed as its own document.
    The first chunk seen for a file supplies the descriptive fields.

    Returns a dict (insertion-ordered) mapping file_id -> info dict with the
    keys ``source``, ``university``, ``country``, ``document_type``,
    ``language``, ``upload_timestamp``, ``file_id`` and ``chunk_count``.
    """
    grouped = {}
    for meta, doc_id in zip(metadatas or [], ids or []):
        # An entry stored without metadata may come back as None.
        meta = meta or {}
        file_id = meta.get("file_id", doc_id)
        entry = grouped.get(file_id)
        if entry is None:
            entry = grouped[file_id] = {
                "source": meta.get("source", "Unknown"),
                "university": meta.get("university", "Unknown"),
                "country": meta.get("country", "Unknown"),
                "document_type": meta.get("document_type", "Unknown"),
                "language": meta.get("language", "Unknown"),
                "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                "file_id": file_id,
                "chunk_count": 0,
            }
        entry["chunk_count"] += 1
    return grouped


def manage_documents(global_vars):
    """Manage uploaded documents - view, delete individual or all documents

    Reads every chunk's metadata from the vectorstore obtained through
    ``global_vars['doc_ingestion']`` and summarises it per document.

    Args:
        global_vars: dict-like application state; only the ``'doc_ingestion'``
            entry is read here.

    Returns:
        A ``(summary, document_list, file_options)`` tuple:
        ``summary`` is a Markdown statistics header (or a status/error
        message), ``document_list`` is the Markdown list of documents (empty
        string when there is nothing to show) and ``file_options`` is always a
        list of dropdown labels (empty when there is nothing to show).
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first!", "", []

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "⚠️ No documents found. Upload documents first.", "", []

        # Only metadata is needed for the overview; ids are always returned,
        # so the chunk texts are not loaded just to be counted.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])
        doc_map = _group_by_file(all_docs["metadatas"], all_docs["ids"])

        if not doc_map:
            return "ℹ️ No documents found in the system.", "", []

        # Create summary
        total_documents = len(doc_map)
        total_chunks = sum(info["chunk_count"] for info in doc_map.values())
        summary = (
            "## 📊 Document Statistics\n"
            "\n"
            f"**Total Documents:** {total_documents}\n"
            "\n"
            f"**Total Text Chunks:** {total_chunks}\n"
            "\n"
            "**Storage Status:** Active\n"
            "\n"
            "## 📚 Document List\n"
        )

        # Create document list with details
        entries = []
        for index, (file_id, info) in enumerate(doc_map.items(), 1):
            # Keep at most the first 19 characters of the timestamp
            # (the length of "YYYY-MM-DDTHH:MM:SS"); str() guards against a
            # non-string value stored in the metadata.
            timestamp = str(info["upload_timestamp"])[:19]
            entries.append(
                "\n"
                f"**{index}. {info['source']}**\n"
                f"- University: {info['university']}\n"
                f"- Country: {info['country']}\n"
                f"- Type: {info['document_type']}\n"
                f"- Language: {info['language']}\n"
                f"- Chunks: {info['chunk_count']}\n"
                f"- Uploaded: {timestamp}\n"
                f"- File ID: `{file_id}`\n"
                "\n"
                "---\n"
            )
        document_list = "".join(entries)

        # Create dropdown options for individual deletion. Two files sharing
        # source and university produce the same label, so drop duplicates
        # (order preserved) to keep the dropdown choices unique.
        file_options = list(dict.fromkeys(
            _document_label(info["source"], info["university"])
            for info in doc_map.values()
        ))

        return summary, document_list, file_options

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"❌ Error loading documents: {str(e)}", "", []


def delete_document(selected_file, current_doc_list, global_vars):
    """Delete a specific document

    Args:
        selected_file: dropdown label (``"<source> (<university>)"``) of the
            document to delete.
        current_doc_list: Markdown document list currently displayed; it is
            returned unchanged whenever nothing was deleted.
        global_vars: dict-like application state; only ``'doc_ingestion'`` is
            read here.

    Returns:
        A ``(status_message, document_list_markdown)`` tuple. On success the
        document list is rebuilt from the vectorstore.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first.", current_doc_list
    if not selected_file:
        return "❌ Please select a document to delete.", current_doc_list

    try:
        vectorstore = doc_ingestion.load_existing_vectorstore()
        if not vectorstore:
            return "❌ No vectorstore found.", current_doc_list

        # Get all chunk metadata and resolve the label back to a file_id
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas"])
        # Pair each chunk id with its metadata, tolerating missing metadata.
        chunks = [
            (meta or {}, doc_id)
            for meta, doc_id in zip(all_docs["metadatas"] or [], all_docs["ids"] or [])
        ]

        # The first chunk whose label matches decides which file is deleted.
        target_file_id = None
        for meta, doc_id in chunks:
            label = _document_label(
                meta.get("source", "Unknown"),
                meta.get("university", "Unknown"),
            )
            if label == selected_file:
                target_file_id = meta.get("file_id", doc_id)
                break

        if target_file_id is None:
            return "❌ Document not found.", current_doc_list

        # Delete all chunks with this file_id
        ids_to_delete = [
            doc_id for meta, doc_id in chunks
            if meta.get("file_id", doc_id) == target_file_id
        ]
        collection.delete(ids=ids_to_delete)

        # Refresh the document list
        _, new_doc_list, _ = manage_documents(global_vars)

        return f"✅ Successfully deleted document: {selected_file}", new_doc_list

    except Exception as e:
        # UI boundary: report the failure and keep the displayed list.
        return f"❌ Error deleting document: {str(e)}", current_doc_list


def delete_all_documents(global_vars):
    """Delete all documents from the vectorstore

    Args:
        global_vars: dict-like application state. ``'doc_ingestion'`` is read,
            and ``'vectorstore'`` is reset to ``None`` after a successful
            deletion.

    Returns:
        A ``(status_message, "")`` tuple; the second element is always an
        empty string.
    """
    doc_ingestion = global_vars.get('doc_ingestion')
    if not doc_ingestion:
        return "❌ Please initialize systems first.", ""

    try:
        vectorstore_instance = doc_ingestion.load_existing_vectorstore()
        if not vectorstore_instance:
            return "⚠️ No documents found to delete.", ""

        # Get all document IDs
        collection = vectorstore_instance._collection
        all_ids = collection.get()["ids"]
        if not all_ids:
            return "ℹ️ No documents found to delete.", ""

        # Delete all documents
        collection.delete(ids=all_ids)

        # Clear global vectorstore
        global_vars['vectorstore'] = None

        return f"✅ Successfully deleted all {len(all_ids)} document chunks.", ""

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"❌ Error deleting all documents: {str(e)}", ""


def create_manage_tab(global_vars):
    """Create the Manage Documents tab

    Builds the tab's components (refresh / delete-all buttons, statistics and
    document list panels, a dropdown plus button for single deletion, and a
    status box) and wires their click handlers. Must be called inside an
    enclosing Gradio layout context.

    Args:
        global_vars: dict-like application state, passed through to
            ``manage_documents``, ``delete_document`` and
            ``delete_all_documents``.
    """
    with gr.Tab("🗂 Manage Documents", id="manage"):
        gr.Markdown("""
        ### Step 4: Manage Your Documents

        View, inspect, and manage all uploaded documents in your knowledge base.
        You can see document details and delete individual documents or all documents.
        """)

        # Buttons for actions
        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Document List", variant="secondary")
            delete_all_btn = gr.Button("🗑️ Delete All Documents", variant="stop")

        # Document statistics and list
        doc_summary = gr.Markdown(
            value="📊 Click 'Refresh Document List' to view your documents.",
            label="Document Summary"
        )

        doc_list = gr.Markdown(
            value="📚 Document details will appear here after refresh.",
            label="Document List"
        )

        # Individual document deletion
        gr.Markdown("### 🗑️ Delete Individual Document")

        with gr.Row():
            file_selector = gr.Dropdown(
                choices=[],
                label="Select Document to Delete",
                interactive=True,
                info="First click 'Refresh Document List' to see available documents"
            )
            delete_single_btn = gr.Button("🗑️ Delete Selected", variant="stop")

        delete_status = gr.Textbox(
            label="Action Status",
            interactive=False,
            lines=2,
            placeholder="Deletion status will appear here..."
        )

        # Event handlers
        def current_view():
            """Read the store and return (summary, list, dropdown update)."""
            summary, documents, file_options = manage_documents(global_vars)
            return summary, documents, gr.Dropdown(choices=file_options, value=None)

        def refresh_documents():
            return current_view()

        def delete_selected_document(selected_file, current_list):
            if not selected_file:
                status = "❌ Please select a document to delete first."
                shown_list = current_list
            else:
                status, shown_list = delete_document(selected_file, current_list, global_vars)

            # Rebuild statistics and dropdown choices from the store in every
            # case, so a click without a selection does not wipe the choices
            # and the statistics never go stale after a deletion.
            summary, _, selector = current_view()
            return status, summary, shown_list, selector

        def delete_all_docs():
            status, _ = delete_all_documents(global_vars)

            # Show what is actually left: if the deletion failed, the
            # documents are still there and must stay visible/selectable.
            summary, remaining, selector = current_view()
            return status, summary, remaining or "📚 No documents in the system.", selector

        # Connect event handlers
        refresh_btn.click(
            refresh_documents,
            outputs=[doc_summary, doc_list, file_selector]
        )

        delete_single_btn.click(
            delete_selected_document,
            inputs=[file_selector, doc_list],
            outputs=[delete_status, doc_summary, doc_list, file_selector]
        )

        delete_all_btn.click(
            delete_all_docs,
            outputs=[delete_status, doc_summary, doc_list, file_selector]
        )