ia-subvention / data /importToDb.py
Add script to get API data and import to vector store
6236000
raw
history blame
5.44 kB
import os
import json
import time
import pandas as pd
import time
from langchain_openai import OpenAIEmbeddings
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
# Module-level setup: read configuration from the environment, make sure the
# target Pinecone index exists, and expose a handle to it as `index`.
load_dotenv()

index_name = os.getenv("PINECONE_INDEX_NAME")
# namespace = os.getenv("PINECONE_NAMESPACE")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

existing_indexes = [info["name"] for info in pc.list_indexes()]

if index_name not in existing_indexes:
    # NOTE(review): dimension=1024 presumably matches the embedding model used
    # further down in this file - confirm before switching models.
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the freshly created index reports ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text):
    """Split *text* into overlapping pieces with a recursive character splitter.

    Sizes are measured in characters (``len``): each chunk is at most 500
    characters long and consecutive chunks share a 100-character overlap.

    Returns the value produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "chunk_size": 500,        # maximum characters per chunk
        "chunk_overlap": 100,     # characters shared between neighbouring chunks
        "length_function": len,   # measure length in characters
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
# Per-source import rules, keyed by the JSON file name a record came from:
#   drop         - top-level keys removed before the record is serialised
#   id_key       - field holding the record identifier in that source
#   deadline_key - field holding the deadline, or None when the source has none
#   prefix       - prepended to the identifier to build "id_document"
_SOURCE_RULES = {
    "aides_entreprises.json": {
        "drop": ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "id_key": "id_aid",
        "deadline_key": "date_fin",
        "prefix": "entreprises",
    },
    "aides_territoires.json": {
        "drop": (),
        "id_key": "id",
        "deadline_key": "submission_deadline",
        "prefix": "territoires",
    },
    "cegara_sub.json": {
        "drop": ("support", "html"),
        "id_key": "id",
        "deadline_key": "validite_fin",
        "prefix": "cegara",
    },
    "les_aides.json": {
        "drop": ("cci", "url"),
        "id_key": "numero",
        "deadline_key": None,
        "prefix": "aides",
    },
}

# Vector store shared by every call; built on first use by _get_vector_store().
_vector_store = None


def _get_vector_store():
    """Return the shared Pinecone vector store, creating it on first use."""
    global _vector_store
    if _vector_store is None:
        # Alternative embedding model kept for reference:
        # embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
        embedding = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
        )
        # namespace=namespace can be passed here once a namespace is configured.
        _vector_store = PineconeVectorStore(index=index, embedding=embedding)
    return _vector_store


def _prepare_record(subvention, source):
    """Return ``(record, metadata)`` for *subvention* using *source*'s rules.

    ``record`` is a pruned shallow copy; the caller's dict is left untouched.
    Raises ``ValueError`` for a source file that has no rules.
    """
    rules = _SOURCE_RULES.get(source)
    if rules is None:
        raise ValueError(f"Unsupported source file: {source!r}")

    record = {
        key: value for key, value in subvention.items()
        if key not in rules["drop"]
    }

    deadline = None
    if rules["deadline_key"] is not None:
        deadline = record.get(rules["deadline_key"])

    identifier = record[rules["id_key"]]
    metadata = {
        **record.get("metadata", {}),
        "id_subvention": identifier,
        # -1 is the sentinel stored when the record carries no deadline.
        "deadline_date": deadline if deadline is not None else -1,
        "id_document": f"{rules['prefix']}_{identifier}",
    }
    return record, metadata


def importAideEntreprise(subvention, source):
    """Embed one subsidy record and store it in the Pinecone index.

    Args:
        subvention: dict describing one subsidy, as loaded from a JSON file.
            It is not modified; pruning happens on a copy.
        source: name of the JSON file the record came from. It selects which
            keys are dropped and which fields provide the id and deadline.

    Returns:
        None. Best-effort: any ``Exception`` (unsupported source, missing id
        field, embedding or storage failure) is printed with its traceback
        instead of being raised, so one bad record does not stop an import.
    """
    try:
        record, metadata = _prepare_record(subvention, source)

        # The whole record is embedded as a single JSON document.
        # ensure_ascii=False keeps accented characters readable instead of
        # turning them into \uXXXX escapes in the embedded text.
        json_text = json.dumps(record, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)

        # "id_document" doubles as the vector ID in the store.
        document_id = metadata["id_document"]
        print("Before add_documents")
        _get_vector_store().add_documents(documents=[document], ids=[document_id])
        print(f"Stored document with ID: {document_id} from source: {source}")
    except Exception as e:
        import traceback
        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions, source):
    """Import every record of *subventions*, pausing after each one.

    Each item is handed to ``importAideEntreprise`` together with *source*,
    followed by a half-second sleep (presumably to throttle calls to the
    remote services - confirm).
    """
    pause_seconds = 0.5
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(pause_seconds)
def go():
    """Import every ``.json`` file found in the ``data`` directory.

    The directory is resolved relative to the current working directory.
    Each file name is printed, its content is parsed as UTF-8 JSON, and the
    parsed data is passed to ``loopSubventions`` with the file name as source.
    Entries that do not end in ``.json`` are ignored.
    """
    data_dir = 'data'
    print("Importing Aide Entreprise data...")
    for entry in os.listdir(data_dir):
        if not entry.endswith(".json"):
            continue
        print(entry)
        with open(os.path.join(data_dir, entry), "r", encoding="utf-8") as handle:
            records = json.load(handle)
            loopSubventions(records, source=entry)
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    go()