import os
import pickle

import faiss
import numpy as np
from pathlib import Path
from tqdm import tqdm

from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# 📁 Parameters
DOCS_DIR = "data"
VECTOR_DIR = "vectordb_docling"
INDEX_FILE = os.path.join(VECTOR_DIR, "index.faiss")
CHUNKS_FILE = os.path.join(VECTOR_DIR, "chunks.pkl")
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"

os.makedirs(VECTOR_DIR, exist_ok=True)
# 📥 Convert documents with Docling
print("📥 Converting documents with Docling...")
converter = DocumentConverter()
dl_docs = []
for pdf_path in Path(DOCS_DIR).glob("*.pdf"):
    print(f" - 📄 {pdf_path.name}")
    docling_doc = converter.convert(str(pdf_path)).document
    dl_docs.append(docling_doc)
# ✂️ Semantic chunking via HybridChunker
print("✂️ Smart chunking with HybridChunker (Docling)...")
chunker = HybridChunker()
text_nodes = []
for dl_doc in tqdm(dl_docs, desc="Chunking"):
    chunks = chunker.chunk(dl_doc=dl_doc)
    for chunk in chunks:
        text_nodes.append(TextNode(text=chunk.text))
print(f"✅ {len(text_nodes)} chunks generated.")
# 🔢 Embedding + FAISS index
# NOTE: e5-family models are usually used with "query: " / "passage: " text
# prefixes; neither is added here.
print("🔢 Generating embeddings and building the FAISS index...")
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

# Probe the model once to infer the embedding dimension
embedding_dim = np.array(embed_model.get_query_embedding("test")).shape[0]
faiss_index = faiss.IndexFlatL2(embedding_dim)
vector_store = FaissVectorStore(faiss_index=faiss_index)
# 🧠 Build the vector index (the FAISS store must be attached via a StorageContext)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(text_nodes, embed_model=embed_model, storage_context=storage_context)
# 💾 Save the index and the chunks
print("💾 Saving the index and the chunks...")
faiss.write_index(faiss_index, INDEX_FILE)

chunks = [node.get_content() for node in text_nodes]
with open(CHUNKS_FILE, "wb") as f:
    pickle.dump(chunks, f)

print(f"✅ {len(chunks)} chunks saved to {CHUNKS_FILE}")