Spaces:
Running
Running
[email protected]
committed on
Commit
·
6236000
1
Parent(s):
18107fd
Add script to get API data and import to vector store
Browse files- data/__init__.py +0 -0
- data/aides_territoires.py +80 -0
- data/get_aides_cegara.py +51 -0
- data/get_aides_entreprises.py +100 -0
- data/get_les_aides.py +93 -0
- data/importToDb.py +151 -0
data/__init__.py
ADDED
File without changes
|
data/aides_territoires.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import http.client
|
2 |
+
import json
|
3 |
+
|
4 |
+
import os

# Single HTTPS connection to the Aides-territoires API, shared by every call
# in this script.
conn = http.client.HTTPSConnection("aides-territoires.beta.gouv.fr")

# NOTE(review): an API token is committed in source. Supply it through the
# AIDES_TERRITOIRES_API_TOKEN environment variable and rotate the committed
# value; the literal only remains as a fallback so existing runs keep working.
headersConnexion = {
    'X-AUTH-TOKEN': os.environ.get(
        'AIDES_TERRITOIRES_API_TOKEN',
        'eeb481e42950f1dbfc46dc348e6e32a0c631cc5b94dd7ab874a30c027f9de87c',
    ),
}
|
8 |
+
|
9 |
+
|
10 |
+
def connexion():
    """Authenticate against the Aides-territoires API and return the token.

    Sends a POST to ``/api/connexion/`` on the module-level ``conn`` with
    ``headersConnexion`` and returns the ``token`` field of the JSON reply.

    Raises:
        RuntimeError: if the server does not answer with a 2xx status, so the
            failure is reported as an HTTP problem rather than as a JSON
            decoding error or a missing ``token`` key.
    """
    conn.request("POST", "/api/connexion/", '', headersConnexion)
    response = conn.getresponse()
    # Always drain the body: http.client needs it read before the connection
    # can be reused for the next request.
    body = response.read().decode("utf-8")
    if not 200 <= response.status < 300:
        raise RuntimeError(
            f"Authentication failed with HTTP {response.status}: {body[:200]}"
        )
    return json.loads(body)['token']
|
14 |
+
|
15 |
+
def get_final_type(types):
    """Reduce a list of aid type labels to one final category name.

    Args:
        types: iterable of type labels (e.g. ``"Subvention"``, ``"Prêt"``);
            ``None`` is treated like an empty list.

    Returns:
        The category name of the first label that is known, or ``None`` when
        no label matches.
    """
    # Source label -> category id.
    type_mapping = {
        "Subvention": 2,
        "Prêt": 1,
        "Avance récupérable": 1,
        "Certificat d'économie d'énergie (CEE)": 4,
        "Autre aide financière": 4,
        "Ingénierie technique": 4,
        "Ingénierie financière": 4,
        "Ingénierie Juridique / administrative": 4,
    }

    # Category id -> name stored in the metadata. Id 3 has no source label
    # mapped to it here.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    # The first recognised label wins; a missing list yields no category.
    for label in types or ():
        category = type_mapping.get(label)
        if category is not None:
            return final_type_mapping[category]

    return None
|
39 |
+
|
40 |
+
|
41 |
+
def request():
    """Fetch every matching aid from the Aides-territoires API, page by page.

    Queries ``/api/aids/`` with the farmer / Nouvelle-Aquitaine filters using
    the module-level ``conn`` and ``headersRequest``, following pages until
    the reply's ``next`` field is ``None``.

    Returns:
        The concatenated ``results`` lists of all pages.
    """
    subventions = []
    page = 1

    while True:
        conn.request("GET", f"/api/aids/?page={page}&organization_type_slugs=farmer&perimeter_id=70971-nouvelle-aquitaine", '', headersRequest)
        data = json.loads(conn.getresponse().read().decode("utf-8"))

        subventions += data['results']

        if data['next'] is None:
            break

        page += 1

    # Report the accumulated total, not just the size of the last page.
    print(f"Total subventions : {len(subventions)}")
    return subventions
|
60 |
+
|
61 |
+
|
62 |
+
# Authenticate first: request() reads the module-level headersRequest.
token = connexion()
headersRequest = {
    'Authorization': 'Bearer ' + token,
}

aides = request()
print(f"Nb aides : {len(aides)}")

# Attach the normalized metadata block to every record.
for aide in aides:
    aide["metadata"] = {
        "type_aide": get_final_type(aide["aid_types"]),
        "lien": f"https://aides-territoires.beta.gouv.fr/aides/{aide['slug']}",
        "Source": "https://aides-territoires.beta.gouv.fr"
    }

# Write under data/ like the other fetch scripts do; the importer only scans
# the data directory for *.json files.
with open('data/aides_territoires.json', 'w', encoding='utf-8') as f:
    json.dump(aides, f, ensure_ascii=False, indent=4)
|
data/get_aides_cegara.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import requests
|
3 |
+
from datetime import *
|
4 |
+
|
5 |
+
def get_final_type(types):
    """Reduce a list of aid type labels to one final category name.

    Args:
        types: iterable of type labels (e.g. ``"Subvention"``, ``"Prêt"``);
            ``None`` is treated like an empty list.

    Returns:
        The category name of the first label that is known, or ``None`` when
        no label matches.
    """
    # Source label -> category id.
    type_mapping = {
        "Subvention": 2,
        "Prêt": 1,
        "Avance récupérable": 1,
        "Certificat d'économie d'énergie (CEE)": 4,
        "Autre aide financière": 4,
        "Ingénierie technique": 4,
        "Ingénierie financière": 4,
        "Ingénierie Juridique / administrative": 4,
    }

    # Category id -> name stored in the metadata.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    # The first recognised label wins; a missing list yields no category.
    for label in types or ():
        category = type_mapping.get(label)
        if category is not None:
            return final_type_mapping[category]

    return None
|
29 |
+
|
30 |
+
|
31 |
+
def request():
    """Download the list of aids from the Groupe Cegara API.

    Returns:
        The decoded JSON body of ``https://api.groupecegara.fr/aidesliste``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no answer arrives within 30 seconds.
    """
    # Without a timeout a stalled server would hang the script forever.
    response = requests.get('https://api.groupecegara.fr/aidesliste', timeout=30)
    # Fail on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()
|
34 |
+
|
35 |
+
aides = request()
print(f"Nb aides : {len(aides)}")

# Every Cegara record gets the same fixed metadata: the type is always
# "Subvention" and both links point at the publications page.
for aide in aides:
    aide["metadata"] = {
        "type_aide": "Subvention",
        "lien": "https://www.groupecegara.fr/publications-flash.html",
        "Source": "https://www.groupecegara.fr/publications-flash.html",
    }

# Dump the enriched records where the importer looks for them.
with open('data/cegara_sub.json', 'w', encoding='utf-8') as out_file:
    json.dump(aides, out_file, ensure_ascii=False, indent=4)
|
data/get_aides_entreprises.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import http.client
|
2 |
+
import json
|
3 |
+
|
4 |
+
import os

# Single HTTPS connection to the aides-entreprises.fr API, shared by every
# call in this script.
conn = http.client.HTTPSConnection("api.aides-entreprises.fr")
payload = ''

# NOTE(review): API credentials are committed in source. Supply them through
# AIDES_ENTREPRISES_API_ID / AIDES_ENTREPRISES_API_KEY and rotate the
# committed values; the literals only remain as fallbacks so existing runs
# keep working.
headers = {
    'X-Aidesentreprises-Id': os.environ.get('AIDES_ENTREPRISES_API_ID', 'jb4nMj67'),
    'X-Aidesentreprises-Key': os.environ.get('AIDES_ENTREPRISES_API_KEY', 'waMF2TjO'),
}
|
10 |
+
|
11 |
+
|
12 |
+
def request(limit=20, offset=0):
    """Fetch one page of aid summaries from the aides-entreprises.fr API.

    Args:
        limit: maximum number of records requested.
        offset: number of records to skip.

    Returns:
        The ``data`` field of the decoded JSON reply.
    """
    print(f"Requesting {limit} subventions from offset {offset}")
    path = f"/v1.1/aides?profils=20&territoire=50002&limit={limit}&offset={offset}"
    conn.request("GET", path, payload, headers)
    raw = conn.getresponse().read()
    return json.loads(raw.decode("utf-8"))['data']
|
18 |
+
|
19 |
+
|
20 |
+
def get_final_type(types):
    """Reduce a list of aid type entries to one final category name.

    Args:
        types: iterable of dicts carrying an ``id_typ`` identifier; ``None``
            is treated like an empty list. The identifier is compared as a
            string, so ``3`` and ``"3"`` are equivalent.

    Returns:
        The category name of the first entry whose identifier is known, or
        ``None`` when nothing matches.
    """
    # Source type id -> category id.
    type_mapping = {
        "16": 3,  # Social security contribution exemption
        "15": 1,  # Honor loan
        "12": 4,  # Prize
        "3": 2,   # Grant
        "5": 1,   # Loan
        "4": 1,   # Repayable advance
        "7": 1,   # Guarantee
        "8": 3,   # Tax relief
        "6": 4,   # Interest subsidy
        "9": 4,   # Equity participation
        "14": 4,  # Call for projects
        "11": 4,  # Leasing
    }

    # Category id -> name stored in the metadata.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    for entry in types or ():
        # Normalise to str so an integer id still matches the string keys;
        # an entry without id_typ is skipped rather than raising.
        category = type_mapping.get(str(entry.get("id_typ")))
        if category is not None:
            return final_type_mapping[category]

    return None
|
48 |
+
|
49 |
+
|
50 |
+
def getAide(aide):
    """Fetch the detailed record of one aid and attach its metadata.

    Args:
        aide: summary record; only its ``id_aid`` is used to build the URL.

    Returns:
        The detailed record, extended with a ``metadata`` dict holding the
        final type, the public link and the source site.
    """
    conn.request("GET", f"/v1.1/aides/{aide['id_aid']}", payload, headers)
    raw = conn.getresponse().read()
    # The decoded reply is indexed with [0]: the record is its first element.
    detail = json.loads(raw.decode("utf-8"))[0]

    print(detail["prets"])
    detail["metadata"] = {
        "type_aide": get_final_type(detail["prets"]),
        "lien": f"https://aides-entreprises.fr/aide/{detail['id_aid']}",
        "Source": "https://aides-entreprises.fr",
    }

    return detail
|
75 |
+
|
76 |
+
# Set to 400 to get all subventions, pagination doesn't work !!!
take = 400
skip = 0

subventions = []

# Keep requesting pages until one comes back shorter than `take`.
while True:
    responses = request(take, skip)
    subventions.extend(responses)

    print(f"Got {len(responses)} subventions")
    if len(responses) < take:
        break

    skip += take

# Replace every summary, in place, with its detailed record.
for position, summary in enumerate(subventions):
    subventions[position] = getAide(summary)

with open('data/aides_entreprises.json', 'w', encoding='utf-8') as f:
    json.dump(subventions, f, ensure_ascii=False, indent=4)
|
data/get_les_aides.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import http.client
|
2 |
+
import json
|
3 |
+
|
4 |
+
import os

# Single HTTPS connection to the les-aides.fr API, shared by every call in
# this script.
conn = http.client.HTTPSConnection("api.les-aides.fr")
payload = ''

# NOTE(review): an API key is committed in source. Supply it through the
# LES_AIDES_API_IDC environment variable and rotate the committed value; the
# literal only remains as a fallback so existing runs keep working.
headers = {
    'X-IDC': os.environ.get(
        'LES_AIDES_API_IDC',
        'bcfac1828e5ef1b7cab084379a5f2a871e82ee7c',
    ),
}

requestID = 23994079
# Filiere and domaine identifiers used to build the search queries.
filieres = [289, 290]
domaines = [893, 883, 877, 790, 793, 798, 802, 805, 862, 807, 810, 813, 816, 820, 818]
|
13 |
+
|
14 |
+
def request(filiere, domaines):
    """Search the les-aides.fr API for one filiere and one set of domaines.

    Args:
        filiere: filiere identifier, inserted as-is in the query string.
        domaines: domaine filter, inserted as-is in the query string.

    Returns:
        The ``dispositifs`` list of the reply; every item is tagged with the
        reply's ``idr`` so the detail request can refer back to this search.
    """
    # The separator before "region" must be a plain "&": the entity-mangled
    # form "®ion" drops the region parameter from the query.
    conn.request("GET", f"/aides?ape=A&region=75&domaine={domaines}&filiere={filiere}", payload, headers)
    data = json.loads(conn.getresponse().read().decode("utf-8"))

    dispositifs = data['dispositifs']
    print(f"Total subventions : {len(dispositifs)}")

    for dispositif in dispositifs:
        dispositif['idr'] = data['idr']

    return dispositifs
|
26 |
+
|
27 |
+
|
28 |
+
def get_final_type(types):
    """Placeholder type mapping for this source: always returns ``None``.

    The argument is accepted but not inspected.
    """
    return None
|
32 |
+
|
33 |
+
|
34 |
+
def getAide(aide):
    """Fetch the detailed record of one dispositif and attach its metadata.

    Args:
        aide: search result carrying the ``idr`` and ``numero`` used to build
            the detail URL.

    Returns:
        The detailed record extended with a ``metadata`` dict, or ``None``
        when anything goes wrong (the error is printed, not raised).
    """
    try:
        detail_path = f"/aide/?requete={aide['idr']}&dispositif={aide['numero']}"
        conn.request("GET", detail_path, payload, headers)
        raw = conn.getresponse().read()
        # `aide` is rebound on purpose: from here on, the error report below
        # prints the detailed record instead of the search result.
        aide = json.loads(raw.decode("utf-8"))

        aide["metadata"] = {
            "lien": aide['uri'],
            "Source": "https://les-aides.fr/",
        }
        return aide
    except Exception as e:
        print(aide)
        print(f"Error: {e}")
        return None
|
58 |
+
|
59 |
+
|
60 |
+
# Dispositifs collected across all searches.
subventions = []

# Split domaines into two sections: the first seven ids and the remaining
# ones, each rendered as a bracketed, comma-separated string.
domaines_section_2 = "[" + ",".join(str(d) for d in domaines[:7]) + "]"
domaines_section_1 = "[" + ",".join(str(d) for d in domaines[7:]) + "]"
|
66 |
+
|
67 |
+
|
68 |
+
# Add dispositifs to the subventions list without duplication.
def add_dispositifs(filiere, domaines_section):
    """Run one search and append its new dispositifs to ``subventions``.

    A dispositif is skipped when one with the same ``numero`` has already
    been collected.

    Args:
        filiere: filiere identifier forwarded to ``request``.
        domaines_section: domaine filter forwarded to ``request``.
    """
    print(f"Requesting filiere {filiere} and domaines {domaines_section}")

    # Index the numbers collected so far once, instead of rescanning the
    # whole list for every dispositif.
    seen = {s.get("numero") for s in subventions if isinstance(s, dict)}

    for dispositif in request(filiere, domaines_section):
        numero = dispositif["numero"]
        if numero not in seen:
            seen.add(numero)
            subventions.append(dispositif)
|
76 |
+
|
77 |
+
|
78 |
+
# Run the search for every filiere against both domaine sections.
for filiere in filieres:
    add_dispositifs(filiere, domaines_section_1)
    add_dispositifs(filiere, domaines_section_2)

print(f"Nb aides : {len(subventions)}")

# Replace every search result, in place, with its detailed record.
for i, dispositif in enumerate(subventions):
    print(f"{dispositif['numero']} : {i} / {len(subventions)}")
    subventions[i] = getAide(dispositif)

# getAide returns None when a detail request fails; keep those placeholders
# out of the dump so it only contains usable records.
subventions = [aide for aide in subventions if aide is not None]

with open('data/les_aides.json', 'w', encoding='utf-8') as f:
    json.dump(subventions, f, ensure_ascii=False, indent=4)
|
data/importToDb.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import time
|
4 |
+
import pandas as pd
|
5 |
+
import time
|
6 |
+
from langchain_openai import OpenAIEmbeddings
|
7 |
+
from langchain_mistralai.embeddings import MistralAIEmbeddings
|
8 |
+
from langchain_pinecone import PineconeVectorStore
|
9 |
+
from langchain_core.documents import Document
|
10 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
+
from pinecone import Pinecone, ServerlessSpec
|
12 |
+
|
13 |
+
from dotenv import load_dotenv
|
14 |
+
|
15 |
+
# Load the environment from a .env file before any os.environ lookup.
load_dotenv()

index_name = os.environ.get("PINECONE_INDEX_NAME")
# namespace = os.environ.get("PINECONE_NAMESPACE")

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index on first run, then poll once per second until it reports
# ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)

print(index_name)
|
37 |
+
|
38 |
+
def get_text_chunks(text):
    """Split *text* into overlapping chunks measured in characters.

    Chunks are at most 500 characters long and overlap by 100 characters.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,       # character length of a chunk
        chunk_overlap=100,    # character length shared by consecutive chunks
        length_function=len,  # measure length in characters
    )
    return splitter.split_text(text)
|
46 |
+
|
47 |
+
|
48 |
+
# Per-source import rules, keyed by the JSON file name passed as `source`:
#   drop     - keys removed from the record before it is embedded
#   id       - key holding the source's own identifier
#   deadline - key holding the deadline, or None when the source has none
#   prefix   - prefix used to build the vector store document id
_SOURCE_RULES = {
    "aides_entreprises.json": {
        "drop": ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "id": "id_aid",
        "deadline": "date_fin",
        "prefix": "entreprises",
    },
    "aides_territoires.json": {
        "drop": (),
        "id": "id",
        "deadline": "submission_deadline",
        "prefix": "territoires",
    },
    "cegara_sub.json": {
        "drop": ("support", "html"),
        "id": "id",
        "deadline": "validite_fin",
        "prefix": "cegara",
    },
    "les_aides.json": {
        "drop": ("cci", "url"),
        "id": "numero",
        "deadline": None,
        "prefix": "aides",
    },
}

# Built on first use and then shared by every import call.
_vector_store = None


def _get_vector_store():
    """Return the shared vector store, creating it on the first call."""
    global _vector_store
    if _vector_store is None:
        embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=os.environ.get("MISTRAL_API_KEY"))
        _vector_store = PineconeVectorStore(index=index, embedding=embedding)  # namespace=namespace
    return _vector_store


def importAideEntreprise(subvention, source):
    """Embed one aid record and store it in the vector store.

    Args:
        subvention: the record (a dict) loaded from a source JSON file. It is
            not modified.
        source: name of the JSON file the record comes from; it selects the
            cleanup and metadata rules in ``_SOURCE_RULES``.

    Errors are reported on stdout with a traceback and never raised, so one
    bad record does not stop a whole import run.
    """
    try:
        rules = _SOURCE_RULES.get(source)
        if rules is None:
            print(f"Error storing document: unknown source {source}")
            return
        if not isinstance(subvention, dict):
            print(f"Error storing document: record from source {source} is not an object")
            return

        # Work on a filtered shallow copy so the caller's dict stays intact.
        record = {key: value for key, value in subvention.items() if key not in rules["drop"]}

        deadline = record.get(rules["deadline"]) if rules["deadline"] else None
        metadata = {
            **(record.get("metadata") or {}),
            "id_subvention": record[rules["id"]],
            # -1 marks a missing or null deadline.
            "deadline_date": -1 if deadline is None else deadline,
            "id_document": f"{rules['prefix']}_{record[rules['id']]}",
        }

        # The whole record is embedded as one JSON string. ensure_ascii=False
        # keeps accented characters as-is instead of \uXXXX escapes.
        json_text = json.dumps(record, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)

        # The document id doubles as the vector id.
        uuid = metadata["id_document"]

        print("Before add_documents")
        _get_vector_store().add_documents(documents=[document], ids=[uuid])

        print(f"Stored document with ID: {uuid} from source: {source}")
    except Exception as e:
        import traceback

        print(f"Error storing document: {e}")
        traceback.print_exc()
|
129 |
+
|
130 |
+
|
131 |
+
def loopSubventions(subventions,source):
    """Import every record of *subventions*, pausing 0.5 s after each one.

    Args:
        subventions: iterable of records loaded from one source file.
        source: file name forwarded to ``importAideEntreprise``.
    """
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(0.5)
|
135 |
+
|
136 |
+
|
137 |
+
def go():
    """Import every ``.json`` file found in the ``data`` directory.

    The file name is passed along as the ``source`` of its records.
    """
    print("Importing Aide Entreprise data...")

    for file_name in os.listdir('data'):
        if not file_name.endswith(".json"):
            continue

        print(file_name)
        with open(os.path.join('data', file_name), "r", encoding="utf-8") as f:
            data = json.load(f)

        loopSubventions(data, source=file_name)


if __name__ == "__main__":
    go()
|