ia-subvention / data /importToDb.py
Update data handling and API request parameters for aides
76f1467
raw
history blame
5.08 kB
import os
import json
import time
import pandas as pd
import time
from langchain_openai import OpenAIEmbeddings
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
# Load environment configuration first: every setting below is read from
# environment variables.
load_dotenv()

index_name = os.getenv("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index on first use: 1024-dimensional vectors, cosine metric,
# serverless deployment on AWS us-east-1.
existing_indexes = [info["name"] for info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports itself ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used by the import functions below to write vectors.
index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping chunks measured in characters.

    Args:
        text: The string to split.
        chunk_size: Character length of each chunk (default 500).
        chunk_overlap: Character length of the overlap between consecutive
            chunks (default 100).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # lengths are counted with Python's len(), i.e. in characters
    )
    return splitter.split_text(text)
# Top-level keys stripped from a record before it is serialized, per source file.
# The keys of this mapping are also the only supported ``source`` values.
_DROPPED_KEYS = {
    "aides_entreprises.json": ("contacts", "contact", "profils", "projets", "cache_indexation"),
    "aides_territoires.json": (),
    "les_aides.json": ("cci", "url"),
}

# Vector store shared by all calls; built lazily so that importing this module
# does not create an embedding client until a document is actually stored.
_vector_store = None


def _get_vector_store():
    """Return the shared Pinecone vector store, creating it on first use."""
    global _vector_store
    if _vector_store is None:
        embedding = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
        )
        _vector_store = PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace
    return _vector_store


def _deadline_or_sentinel(subvention, key):
    """Return ``subvention[key]``, or -1 when the key is absent or None."""
    value = subvention.get(key)
    return -1 if value is None else value


def _prepare_record(subvention, source):
    """Build the (payload, metadata) pair for one record without mutating it.

    Raises:
        ValueError: if *source* is not one of the supported file names.
        KeyError: if the record lacks the identifier field of its source.
    """
    if source not in _DROPPED_KEYS:
        raise ValueError(f"Unsupported source file: {source}")

    dropped = _DROPPED_KEYS[source]
    # Shallow copy minus the dropped keys: the caller's dict stays untouched.
    payload = {key: value for key, value in subvention.items() if key not in dropped}
    # ``or {}`` also covers an explicit ``"metadata": null`` in the JSON.
    base_metadata = payload.get("metadata") or {}

    if source == "aides_entreprises.json":
        record_id = subvention["id_aid"]
        deadline = _deadline_or_sentinel(subvention, "date_fin")
        prefix = "entreprises"
    elif source == "aides_territoires.json":
        record_id = subvention["id"]
        deadline = _deadline_or_sentinel(subvention, "submission_deadline")
        prefix = "territoires"
    else:  # "les_aides.json"
        if "sirets" in base_metadata:
            # "sirets" is removed from both the stored metadata and the
            # serialized text, done on a copy of the nested dict.
            base_metadata = {k: v for k, v in base_metadata.items() if k != "sirets"}
            payload["metadata"] = base_metadata
        record_id = subvention["numero"]
        deadline = -1  # this source carries no deadline field
        prefix = "aides"

    metadata = {
        **base_metadata,
        "id_subvention": record_id,
        "deadline_date": deadline,
        "id_document": f"{prefix}_{record_id}",
    }
    return payload, metadata


def importAideEntreprise(subvention, source):
    """Embed one subsidy record and store it in the Pinecone index.

    The record is serialized to JSON (minus a few source-specific keys) and
    stored as a single document whose ID is ``metadata["id_document"]``.

    Args:
        subvention: Dict describing one subsidy, as loaded from a JSON file.
            It is not modified.
        source: Name of the file the record came from; one of
            "aides_entreprises.json", "aides_territoires.json" or
            "les_aides.json".

    Returns:
        None. Failures are reported on stdout (message plus traceback) and
        swallowed so that a bad record does not stop a bulk import.
    """
    try:
        payload, metadata = _prepare_record(subvention, source)
        # ensure_ascii=False keeps accented characters as real text; the
        # default would embed them as \uXXXX escape sequences.
        json_text = json.dumps(payload, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)
        doc_id = metadata["id_document"]
        vector_store = _get_vector_store()
        print("Before add_documents")
        # Embed and store the document in Pinecone
        vector_store.add_documents(documents=[document], ids=[doc_id])
        print(f"Stored document with ID: {doc_id} from source: {source}")
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller continue.
        import traceback
        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions, source):
    """Import each record of *subventions* in turn, tagging it with *source*.

    Args:
        subventions: Iterable of subsidy records.
        source: File name passed through to ``importAideEntreprise``.
    """
    for record in subventions:
        importAideEntreprise(record, source)
        # Pause half a second after every record (presumably to stay under
        # API rate limits - confirm before changing).
        time.sleep(0.5)
def go():
    """Scan the ``data`` directory and import the records of ``les_aides.json``.

    Every ``.json`` file name found is printed, but only ``les_aides.json``
    is actually loaded and passed to ``loopSubventions``.
    """
    print("Importing Aide Entreprise data...")
    data_dir = 'data'
    json_names = (name for name in os.listdir(data_dir) if name.endswith(".json"))
    for file_name in json_names:
        print(file_name)
        if file_name != "les_aides.json":
            # Other JSON files are listed but not imported.
            continue
        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as handle:
            records = json.load(handle)
        loopSubventions(records, source=file_name)
# Start the import only when this file is executed as a script.
if __name__ == "__main__":
    go()