import os | |
import json | |
import time | |
import pandas as pd | |
import time | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_mistralai.embeddings import MistralAIEmbeddings | |
from langchain_pinecone import PineconeVectorStore | |
from langchain_core.documents import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from pinecone import Pinecone, ServerlessSpec | |
from dotenv import load_dotenv | |
# Load settings from a local .env file into the process environment.
load_dotenv()

# Name of the Pinecone index every document is written to.
index_name = os.getenv("PINECONE_INDEX_NAME")
# Namespace support is currently disabled:
# namespace = os.getenv("PINECONE_NAMESPACE")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Names of the indexes reported by the Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

if index_name not in existing_indexes:
    # First run: create the index.
    # NOTE(review): dimension=1024 presumably matches the output size of the
    # embedding model used further down — confirm before changing models.
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports itself ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle on the index, shared by the import functions below.
index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text):
    """Split *text* with a ``RecursiveCharacterTextSplitter`` and return the chunks.

    The splitter is configured with a chunk size of 500 and an overlap of 100,
    both measured in characters because ``len`` is the length function.
    """
    splitter_options = {
        "chunk_size": 500,       # target chunk length, in characters
        "chunk_overlap": 100,    # characters shared between neighbouring chunks
        "length_function": len,  # measure length as a plain character count
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
# Per-source ingestion rules, keyed by the data file name. Each value is
#   (fields removed before embedding,
#    key holding the record id,
#    key holding the deadline, or None when the source has none,
#    prefix used to build the document id).
_SOURCE_RULES = {
    "aides_entreprises.json": (
        ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "id_aid",
        "date_fin",
        "entreprises",
    ),
    "aides_territoires.json": ((), "id", "submission_deadline", "territoires"),
    "cegara_sub.json": (("support", "html"), "id", "validite_fin", "cegara"),
    "les_aides.json": (("cci", "url"), "numero", None, "aides"),
}


def _prepare_subvention(subvention, source):
    """Return ``(cleaned_record, metadata)`` for *subvention*, or ``None`` if *source* is unknown."""
    rules = _SOURCE_RULES.get(source)
    if rules is None:
        return None
    dropped_fields, id_key, deadline_key, id_prefix = rules

    # Filter into a new dict (key order preserved) so the caller's record is
    # never mutated.
    cleaned = {
        key: value
        for key, value in subvention.items()
        if key not in dropped_fields
    }

    # A missing or null deadline is stored as the sentinel -1.
    deadline = None if deadline_key is None else cleaned.get(deadline_key)
    metadata = {
        **(cleaned.get("metadata") or {}),
        "id_subvention": cleaned[id_key],
        "deadline_date": -1 if deadline is None else deadline,
        "id_document": f"{id_prefix}_{cleaned[id_key]}",
    }
    return cleaned, metadata


def importAideEntreprise(subvention, source):
    """Embed one subsidy record and store it in the Pinecone index.

    The record is stripped of the fields configured for its source, serialised
    to JSON, embedded with the ``mistral-embed`` model and written to the
    module-level ``index`` under the id ``<prefix>_<record id>``.

    Args:
        subvention: Mapping holding one subsidy record. It is not modified.
        source: Name of the data file the record came from. It selects the
            per-source rules; an unsupported name is reported and skipped.

    Returns:
        None. Failures are printed (with a traceback) rather than raised, so
        one bad record does not abort a bulk import.
    """
    try:
        prepared = _prepare_subvention(subvention, source)
        if prepared is None:
            # Report unsupported files explicitly, before any embedding client
            # is created, instead of failing later on an unbound variable.
            print(f"Error storing document: unsupported source {source!r}")
            return
        cleaned, metadata = prepared

        # The whole cleaned record becomes the text that is embedded.
        # ensure_ascii=False keeps accented characters as real text instead of
        # \uXXXX escapes; json.loads() of the stored text gives the same data.
        json_text = json.dumps(cleaned, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)

        # The document id doubles as the vector id passed to the store.
        document_id = metadata["id_document"]

        embedding = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
        )
        # No namespace is passed to the vector store.
        vector_store = PineconeVectorStore(index=index, embedding=embedding)

        print("Before add_documents")
        vector_store.add_documents(documents=[document], ids=[document_id])
        print(f"Stored document with ID: {document_id} from source: {source}")
    except Exception as e:
        # Best-effort import: report the failure once, with its traceback.
        import traceback

        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions, source):
    """Import every record in *subventions*, pausing after each one.

    Args:
        subventions: Iterable of subsidy records.
        source: Name of the data file the records came from; passed through
            unchanged to ``importAideEntreprise``.
    """
    # NOTE(review): the pause is presumably there to stay under API rate
    # limits — confirm before changing it.
    pause_seconds = 0.5
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(pause_seconds)
def go():
    """Import every ``.json`` file found in the ``data`` directory.

    Each file is parsed as JSON and handed to ``loopSubventions`` with the
    file name as the source. Other files are ignored.
    """
    data_dir = 'data'
    print("Importing Aide Entreprise data...")
    for entry in os.listdir(data_dir):
        if not entry.endswith(".json"):
            continue
        print(entry)
        json_path = os.path.join(data_dir, entry)
        with open(json_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        loopSubventions(records, source=entry)


# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    go()