"""Ingest subvention JSON files from ./data into a Pinecone vector index."""
import os
import json
import time
import pandas as pd
import time
from langchain_openai import OpenAIEmbeddings
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
load_dotenv()
# --- Pinecone index bootstrap (runs at import time) ---
# Requires PINECONE_API_KEY and PINECONE_INDEX_NAME in the environment
# (loaded from .env above).
index_name = os.environ.get("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
    # First run: create the serverless index. dimension=1024 must match the
    # embedding size of the model used below (mistral-embed) — keep in sync.
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the newly created index is ready to accept upserts.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)
# Shared index handle used by importAideEntreprise below.
index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into overlapping character chunks.

    Args:
        text: Raw text to split.
        chunk_size: Maximum character length of each chunk (default 500).
        chunk_overlap: Characters shared between consecutive chunks (default 100).

    Returns:
        list[str]: The resulting chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # measure in characters (the python len() fn.)
    )
    return text_splitter.split_text(text)
# Fields stripped per source before embedding: either bulky (raw html,
# indexation caches) or not useful for retrieval (contacts, urls).
_FIELDS_TO_DROP = {
    "aides_entreprises.json": ("contacts", "contact", "profils", "projets", "cache_indexation"),
    "cegara_sub.json": ("support", "html"),
    "les_aides.json": ("cci", "url"),
}


def _deadline(subvention, key):
    """Return subvention[key] as the deadline date, or -1 when absent/None."""
    value = subvention.get(key)
    return value if value is not None else -1


def importAideEntreprise(subvention, source):
    """Embed one subvention record and upsert it into the Pinecone index.

    Args:
        subvention: dict parsed from one of the source JSON files.
            NOTE: mutated in place — noisy fields are removed before upload.
        source: file name of the origin dataset; selects the metadata mapping.

    Side effects: calls the Mistral embeddings API and writes to the
    module-level Pinecone `index`. Errors are logged (with traceback)
    rather than raised, so batch ingestion continues past bad records.
    """
    try:
        # Initialize the Mistral embedding model.
        # embedding = OpenAIEmbeddings(model="text-embedding-ada-002")  # alternative model
        embedding = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
        )
        vector_store = PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace

        # Drop noisy fields for this source (no-op for unknown keys/sources).
        for field in _FIELDS_TO_DROP.get(source, ()):
            subvention.pop(field, None)

        if source == "aides_entreprises.json":
            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention["id_aid"],
                "deadline_date": _deadline(subvention, "date_fin"),
                "id_document": f"entreprises_{subvention['id_aid']}",
            }
        elif source == "aides_territoires.json":
            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention["id"],
                "deadline_date": _deadline(subvention, "submission_deadline"),
                "id_document": f"territoires_{subvention['id']}",
            }
        elif source == "cegara_sub.json":
            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention["id"],
                "deadline_date": _deadline(subvention, "validite_fin"),
                "id_document": f"cegara_{subvention['id']}",
            }
        elif source == "les_aides.json":
            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention["numero"],
                "deadline_date": -1,  # this dataset carries no deadline field
                "id_document": f"aides_{subvention['numero']}",
            }
        else:
            # Previously an unknown source fell through to a NameError on
            # `metadata` (masked by the broad except); skip explicitly instead.
            print(f"Unknown source '{source}' — skipping document")
            return

        # Combine JSON data as a single document for embedding.
        json_text = json.dumps(subvention, indent=4)
        document = Document(page_content=json_text, metadata=metadata)
        # Deterministic ID: re-running the import overwrites instead of duplicating.
        doc_id = metadata["id_document"]
        print("Before add_documents")
        # Embed and store the document in Pinecone.
        vector_store.add_documents(documents=[document], ids=[doc_id])
        print(f"Stored document with ID: {doc_id} from source: {source}")
    except Exception as e:
        # Best-effort ingestion: log once (with traceback) and move on.
        import traceback
        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions, source):
    """Import every record in *subventions*, pausing briefly between calls.

    The 0.5 s sleep throttles requests to the embedding / vector-store APIs.
    """
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(0.5)
def go():
    """Ingest every JSON file under ./data into the vector index.

    Each file's top-level JSON value is expected to be a list of subvention
    dicts; the file name doubles as the `source` tag that selects the
    metadata mapping in importAideEntreprise.
    """
    print("Importing Aide Entreprise data...")
    for file_name in os.listdir("data"):
        # Guard clause: skip anything that is not a JSON dataset.
        if not file_name.endswith(".json"):
            continue
        print(file_name)
        file_path = os.path.join("data", file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        loopSubventions(data, source=file_name)


if __name__ == "__main__":
    go()