"""Import subsidy ("aide") records from JSON dumps into a Pinecone index.

Each record is serialized to JSON text, embedded with the ``mistral-embed``
model and stored in the Pinecone index named by ``PINECONE_INDEX_NAME``.

Importing this module has side effects: it loads ``.env``, connects to
Pinecone and creates the index when it does not exist yet.
"""

import json
import os
import time
import traceback
from functools import lru_cache

import pandas as pd
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

index_name = os.environ.get("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the embedding model's vector size; 1024 is
        # presumably the size of "mistral-embed" vectors - confirm if the
        # embedding model is ever changed.
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is asynchronous: poll until Pinecone reports it ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
print(index_name)

# Value stored in ``deadline_date`` when a record has no usable deadline.
_NO_DEADLINE = -1

# Per-source import rules.
#   drop          - top-level keys removed from the record before embedding
#   drop_metadata - keys removed from the record's nested "metadata" dict
#   id_key        - record field holding the source-specific identifier
#   deadline_key  - record field holding the deadline (None: never available)
#   prefix        - prefix of the generated document ID
_SOURCE_RULES = {
    "aides_entreprises.json": {
        "drop": ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "drop_metadata": (),
        "id_key": "id_aid",
        "deadline_key": "date_fin",
        "prefix": "entreprises",
    },
    "aides_territoires.json": {
        "drop": (),
        "drop_metadata": (),
        "id_key": "id",
        "deadline_key": "submission_deadline",
        "prefix": "territoires",
    },
    "les_aides.json": {
        "drop": ("cci", "url"),
        "drop_metadata": ("sirets",),
        "id_key": "numero",
        "deadline_key": None,
        "prefix": "aides",
    },
}


def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping chunks measured in characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of a chunk, in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # lengths are counted with the builtin len()
    )
    return text_splitter.split_text(text)


@lru_cache(maxsize=1)
def _get_vector_store():
    """Build the embedding client and vector store once, then reuse them."""
    # Alternative kept for reference:
    #   OpenAIEmbeddings(model="text-embedding-ada-002")
    # Switching models requires an index whose dimension matches the model.
    embedding = MistralAIEmbeddings(
        model="mistral-embed",
        mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
    )
    return PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace


def _prepare_record(subvention, source):
    """Return ``(payload, metadata)`` for a record without mutating it.

    ``payload`` is the record stripped of the keys excluded for *source*;
    ``metadata`` is the record's own "metadata" dict (if any) extended with
    ``id_subvention``, ``deadline_date`` and ``id_document``.

    Raises:
        ValueError: If *source* is not one of the supported file names.
        KeyError: If the record lacks the identifier field for *source*.
    """
    rules = _SOURCE_RULES.get(source)
    if rules is None:
        raise ValueError(f"Unsupported source: {source!r}")

    # Work on copies so the caller's dict (and its nested metadata) survive.
    payload = {
        key: value
        for key, value in subvention.items()
        if key not in rules["drop"]
    }
    base_metadata = {
        key: value
        for key, value in subvention.get("metadata", {}).items()
        if key not in rules["drop_metadata"]
    }
    if rules["drop_metadata"] and "metadata" in payload:
        # The embedded text must not contain the excluded metadata either.
        payload["metadata"] = base_metadata

    deadline = _NO_DEADLINE
    if rules["deadline_key"] is not None:
        raw_deadline = subvention.get(rules["deadline_key"])
        if raw_deadline is not None:
            deadline = raw_deadline

    record_id = subvention[rules["id_key"]]
    metadata = {
        **base_metadata,
        "id_subvention": record_id,
        "deadline_date": deadline,
        "id_document": f"{rules['prefix']}_{record_id}",
    }
    return payload, metadata


def importAideEntreprise(subvention, source):
    """Embed one subsidy record and store it in the Pinecone index.

    The record is serialized as indented JSON and stored as a single
    document whose ID is ``<prefix>_<record id>``, so the same record always
    maps to the same document ID.

    Args:
        subvention: The record as a dict; it is not modified.
        source: Name of the file the record comes from. Supported values are
            "aides_entreprises.json", "aides_territoires.json" and
            "les_aides.json"; it selects the identifier field, the deadline
            field and the keys excluded from the stored document.

    Returns:
        None. Failures are reported on stdout (message and traceback) and
        never propagate, so one bad record does not abort a bulk import.
    """
    try:
        payload, metadata = _prepare_record(subvention, source)

        # ensure_ascii=False keeps accented characters as real text instead of
        # \uXXXX escapes, so the embedding model sees the actual words.
        json_text = json.dumps(payload, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)
        document_id = metadata["id_document"]

        vector_store = _get_vector_store()
        print("Before add_documents")
        vector_store.add_documents(documents=[document], ids=[document_id])
        print(f"Stored document with ID: {document_id} from source: {source}")
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on with the
        # next record.
        print(f"Error storing document: {e}")
        traceback.print_exc()


def loopSubventions(subventions, source, delay=0.5):
    """Import every record of *subventions*, pausing between records.

    Args:
        subventions: Iterable of record dicts.
        source: File name the records come from (see importAideEntreprise).
        delay: Seconds to sleep after each record, to throttle API calls.
    """
    for subv in subventions:
        importAideEntreprise(subv, source)
        time.sleep(delay)


def go(sources=("les_aides.json",), data_dir="data"):
    """Import the selected JSON files found in *data_dir*.

    Every ``.json`` file name in the directory is printed, but only files
    listed in *sources* are loaded and imported.

    Args:
        sources: File names to import. Defaults to "les_aides.json" only.
        data_dir: Directory containing the JSON dumps.
    """
    print("Importing Aide Entreprise data...")
    for file_name in os.listdir(data_dir):
        if not file_name.endswith(".json"):
            continue
        print(file_name)
        if file_name not in sources:
            continue
        file_path = os.path.join(data_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # The file is closed before the (long) upload loop starts.
        loopSubventions(data, source=file_name)


if __name__ == "__main__":
    go()