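"""Index French subsidy ("aides") JSON records into a Pinecone vector store.

The script reads every *.json file in the local `data/` directory, builds
per-source metadata for each record, serializes the record to text, embeds it
with Mistral's `mistral-embed` model, and upserts it into a Pinecone serverless
index named by the PINECONE_INDEX_NAME environment variable.
"""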
import os
import json
import time
import traceback

from langchain_openai import OpenAIEmbeddings  # only needed for the commented-out OpenAI alternative below
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec

from dotenv import load_dotenv

load_dotenv()

index_name = os.environ.get("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
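    # dimension=1024 matches the embedding size of Mistral's mistral-embed model used below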
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

print(f"Using Pinecone index: {index_name}")

def get_text_chunks(text):
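    """Split raw text into 500-character chunks with 100-character overlap (helper, not called in this script)."""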
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # character length of each chunk
        chunk_overlap=100,  # character overlap between consecutive chunks
        length_function=len  # measure length in characters (Python's built-in len)
    )
    chunks = text_splitter.split_text(text)
    return chunks


def importAideEntreprise(subvention, source):
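    """Embed one subsidy record and upsert it into the Pinecone index.

    `subvention` is a single record parsed from a source JSON file; `source` is
    the file name, which decides which fields are dropped and how the metadata
    (id_subvention, deadline_date, id_document) is built.
    """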

    try:
        # Initialize the embedding model (OpenAI alternative kept for reference)
        # embedding = OpenAIEmbeddings(model="text-embedding-ada-002")  # swap in your preferred model
        embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=os.environ.get("MISTRAL_API_KEY"))
        vector_store = PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace


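        # Each source file has its own schema: drop bulky/unneeded fields, then
        # normalize metadata to id_subvention, deadline_date (-1 when unknown)
        # and a unique id_document used as the vector ID.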
        if source == "aides_entreprises.json":

            if 'contacts' in subvention:
                del subvention['contacts']
            if 'contact' in subvention:
                del subvention['contact']
            if 'profils' in subvention:
                del subvention['profils']
            if 'projets' in subvention:
                del subvention['projets']
            if 'cache_indexation' in subvention:
                del subvention['cache_indexation']

            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention['id_aid'],
                "deadline_date": subvention['date_fin'] if 'date_fin' in subvention and subvention['date_fin'] is not None else -1,
                "id_document": f"entreprises_{subvention['id_aid']}"
            }
        elif source == "aides_territoires.json":
            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention['id'],
                "deadline_date": subvention['submission_deadline'] if 'submission_deadline' in subvention and subvention['submission_deadline'] is not None else -1,
                "id_document": f"territoires_{subvention['id']}"
            }
        elif source == "cegara_sub.json":
            if 'support' in subvention:
                del subvention['support']
            if 'html' in subvention:
                del subvention['html']

            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention['id'],
                "deadline_date": subvention['validite_fin'] if 'validite_fin' in subvention and subvention['validite_fin'] is not None else -1,
                "id_document": f"cegara_{subvention['id']}"
            }
        elif source == "les_aides.json":

            if 'cci' in subvention:
                del subvention['cci']
            if 'url' in subvention:
                del subvention['url']

            metadata = {
                **subvention.get("metadata", {}),
                "id_subvention": subvention['numero'],
                "deadline_date": -1,
                "id_document": f"aides_{subvention['numero']}"
            }
        else:
            # Unknown source file: skip it rather than fail later on an undefined metadata dict
            print(f"Unknown source {source}, skipping document")
            return

        # Combine JSON data as a single document for embedding
        json_text = json.dumps(subvention, indent=4)  # Convert JSON document to string
        document = Document(page_content=json_text, metadata=metadata)  # Create document with metadata
        
        # Generate a unique ID
        uuid = metadata["id_document"]  # Using 'id_document' as a unique ID
        
        print("Before add_documents")
        # Embed and store the document in Pinecone
        vector_store.add_documents(documents=[document], ids=[uuid])

        print(f"Stored document with ID: {uuid} from source: {source}")
    except Exception as e:
        print(f"Error storing document: {e}")
        traceback.print_exc()


def loopSubventions(subventions,source):
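    """Import each record in turn, sleeping 0.5 s between calls (presumably to respect API rate limits)."""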
    for subv in subventions:
        importAideEntreprise(subv,source)
        time.sleep(0.5)
        

def go():
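    """Walk the local `data/` directory and import every JSON file it contains."""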

    print("Importing Aide Entreprise data...")

    for file_name in os.listdir('data'):
        if file_name.endswith(".json"):
            print(file_name)
            file_path = os.path.join('data', file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

                loopSubventions(data,source=file_name)

if __name__ == "__main__":
    go()