[email protected] committed on
Commit
6236000
·
1 Parent(s): 18107fd

Add script to get API data and import to vector store

Browse files
data/__init__.py ADDED
File without changes
data/aides_territoires.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+
4
import os

# HTTPS connection shared by every API call in this script.
conn = http.client.HTTPSConnection("aides-territoires.beta.gouv.fr")

# Header sent to /api/connexion/ to obtain a bearer token.
# SECURITY: the fallback literal below is a credential committed to the
# repository. Set AIDES_TERRITOIRES_API_TOKEN instead, then rotate the token
# and remove the hard-coded value.
headersConnexion = {
    'X-AUTH-TOKEN': os.environ.get(
        'AIDES_TERRITOIRES_API_TOKEN',
        'eeb481e42950f1dbfc46dc348e6e32a0c631cc5b94dd7ab874a30c027f9de87c',
    ),
}
8
+
9
+
10
def connexion():
    """Authenticate against the API and return the bearer token.

    Sends an empty-bodied POST to ``/api/connexion/`` using the module-level
    ``conn`` and ``headersConnexion``, then returns the ``token`` field of
    the JSON reply.
    """
    conn.request("POST", "/api/connexion/", '', headersConnexion)
    response = conn.getresponse()
    body = response.read().decode("utf-8")
    reply = json.loads(body)
    return reply['token']
14
+
15
def get_final_type(types):
    """Map a list of aid-type labels to one consolidated category label.

    The labels in *types* are checked in order and the first one present in
    the mapping decides the result.

    Args:
        types: iterable of aid-type label strings. ``None`` is accepted and
            treated like an empty iterable.

    Returns:
        The consolidated category label, or ``None`` when *types* is
        ``None``, empty, or contains no known label.
    """
    # Source label -> consolidated category code.
    type_mapping = {
        "Subvention": 2,
        "Prêt": 1,
        "Avance récupérable": 1,
        "Certificat d'économie d'énergie (CEE)": 4,
        "Autre aide financière": 4,
        "Ingénierie technique": 4,
        "Ingénierie financière": 4,
        "Ingénierie Juridique / administrative": 4,
    }

    # Category code -> label. No source label above maps to code 3.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    # `types or ()` lets a missing/None list behave like an empty one
    # instead of raising TypeError.
    for t in types or ():
        code = type_mapping.get(t)
        if code is not None:
            return final_type_mapping[code]

    return None
39
+
40
+
41
def request():
    """Fetch every page of the aids listing and return the combined results.

    Pages are requested one after another, starting at page 1, until a
    response's ``next`` field is ``None``. Relies on the module-level
    ``conn`` and ``headersRequest``, so ``headersRequest`` must be defined
    before this function is called.

    Returns:
        list: concatenation of the ``results`` arrays of all pages.
    """
    subventions = []
    page = 1

    while True:
        conn.request("GET", f"/api/aids/?page={page}&organization_type_slugs=farmer&perimeter_id=70971-nouvelle-aquitaine", '', headersRequest)
        data = json.loads(conn.getresponse().read().decode("utf-8"))

        subventions += data['results']

        if data['next'] is None:
            break

        page += 1

    # Report the accumulated total, not just the size of the last page.
    print(f"Total subventions : {len(subventions)}")
    return subventions
60
+
61
+
62
# Authenticate once, then reuse the bearer token for every listing request.
token = connexion()
headersRequest = {
    'Authorization': 'Bearer ' + token,
}

aides = request()
print(f"Nb aides : {len(aides)}")

# Attach the metadata block that is stored next to each record.
for aide in aides:
    aide["metadata"] = {
        "type_aide": get_final_type(aide["aid_types"]),
        "lien": f"https://aides-territoires.beta.gouv.fr/aides/{aide['slug']}",
        "Source": "https://aides-territoires.beta.gouv.fr"
    }


# Written under data/ (relative to the working directory), because the
# import step lists the 'data' directory and recognises this file by the
# name "aides_territoires.json".
with open('data/aides_territoires.json', 'w', encoding='utf-8') as f:
    json.dump(aides, f, ensure_ascii=False, indent=4)
data/get_aides_cegara.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import requests
3
+ from datetime import *
4
+
5
def get_final_type(types):
    """Map a list of aid-type labels to one consolidated category label.

    The labels in *types* are checked in order and the first one present in
    the mapping decides the result.

    Args:
        types: iterable of aid-type label strings. ``None`` is accepted and
            treated like an empty iterable.

    Returns:
        The consolidated category label, or ``None`` when *types* is
        ``None``, empty, or contains no known label.
    """
    # Source label -> consolidated category code.
    type_mapping = {
        "Subvention": 2,
        "Prêt": 1,
        "Avance récupérable": 1,
        "Certificat d'économie d'énergie (CEE)": 4,
        "Autre aide financière": 4,
        "Ingénierie technique": 4,
        "Ingénierie financière": 4,
        "Ingénierie Juridique / administrative": 4,
    }

    # Category code -> label. No source label above maps to code 3.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    # `types or ()` lets a missing/None list behave like an empty one
    # instead of raising TypeError.
    for t in types or ():
        code = type_mapping.get(t)
        if code is not None:
            return final_type_mapping[code]

    return None
29
+
30
+
31
def request():
    """Download the Cegara aid list and return the decoded JSON payload.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within the timeout.
    """
    # Without an explicit timeout `requests` can wait indefinitely on a
    # stalled server.
    response = requests.get('https://api.groupecegara.fr/aidesliste', timeout=30)
    # Fail on 4xx/5xx here rather than on a confusing JSON decode error.
    response.raise_for_status()
    return response.json()
34
+
35
aides = request()
print(f"Nb aides : {len(aides)}")

# Every Cegara record receives the same fixed metadata; a fresh dict is built
# per record so the records do not share one object.
for aide in aides:
    aide["metadata"] = dict(
        type_aide="Subvention",
        lien="https://www.groupecegara.fr/publications-flash.html",
        Source="https://www.groupecegara.fr/publications-flash.html",
    )


# The output name is fixed; a per-ISO-week file name was tried earlier and
# left disabled.
with open('data/cegara_sub.json', 'w', encoding='utf-8') as out:
    out.write(json.dumps(aides, ensure_ascii=False, indent=4))
data/get_aides_entreprises.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+
4
import os

# HTTPS connection shared by every API call in this script.
conn = http.client.HTTPSConnection("api.aides-entreprises.fr")
# All requests are GETs sent with an empty body.
payload = ''
# SECURITY: the fallback literals below are credentials committed to the
# repository. Set AIDES_ENTREPRISES_API_ID / AIDES_ENTREPRISES_API_KEY
# instead, then rotate the credentials and remove the hard-coded values.
headers = {
    'X-Aidesentreprises-Id': os.environ.get('AIDES_ENTREPRISES_API_ID', 'jb4nMj67'),
    'X-Aidesentreprises-Key': os.environ.get('AIDES_ENTREPRISES_API_KEY', 'waMF2TjO'),
}
10
+
11
+
12
def request(limit=20, offset=0):
    """Fetch one page of the aid listing and return its ``data`` array.

    Args:
        limit: value of the ``limit`` query parameter.
        offset: value of the ``offset`` query parameter.

    Returns:
        The ``data`` field of the decoded JSON response.
    """
    print(f"Requesting {limit} subventions from offset {offset}")
    path = f"/v1.1/aides?profils=20&territoire=50002&limit={limit}&offset={offset}"
    conn.request("GET", path, payload, headers)
    raw = conn.getresponse().read()
    decoded = json.loads(raw.decode("utf-8"))
    return decoded['data']
18
+
19
+
20
def get_final_type(types):
    """Map a list of type records to one consolidated category label.

    Each entry of *types* is expected to be a dict carrying an ``id_typ``
    key. Entries are checked in order and the first one whose id is known
    decides the result.

    Args:
        types: iterable of dicts with an ``id_typ`` key. ``None`` is
            accepted and treated like an empty iterable; entries that are
            not dicts or lack ``id_typ`` are skipped.

    Returns:
        The consolidated category label, or ``None`` when nothing matches.
    """
    # API type id -> consolidated category code.
    type_mapping = {
        "16": 3,  # Social security contribution exemption
        "15": 1,  # Honor loan
        "12": 4,  # Prize
        "3": 2,   # Grant
        "5": 1,   # Loan
        "4": 1,   # Repayable advance
        "7": 1,   # Guarantee
        "8": 3,   # Tax relief
        "6": 4,   # Interest subsidy
        "9": 4,   # Equity investment
        "14": 4,  # Call for projects
        "11": 4,  # Leasing
    }

    # Category code -> label.
    final_type_mapping = {
        1: "Avance − Prêts − Garanties",
        2: "Subvention",
        3: "Prise en charge des coûts et allègement des charges",
        4: "Autres",
    }

    for t in types or ():
        if not isinstance(t, dict):
            continue
        # str() lets a numeric id (3) match the same entry as its string
        # form ("3"); a missing id becomes "None", which is never a key.
        code = type_mapping.get(str(t.get("id_typ")))
        if code is not None:
            return final_type_mapping[code]

    return None
48
+
49
+
50
def getAide(aide):
    """Fetch the detailed record of one aid and attach its metadata.

    Args:
        aide: listing entry; only its ``id_aid`` is used to build the URL.

    Returns:
        The first element of the detail response, with a ``metadata`` dict
        added. All fields of the detail record are kept as received.
    """
    conn.request("GET", f"/v1.1/aides/{aide['id_aid']}", payload, headers)
    raw = conn.getresponse().read()
    detail = json.loads(raw.decode("utf-8"))[0]

    # The "prets" entries are echoed to stdout and drive the category label.
    print(detail["prets"])
    detail["metadata"] = dict(
        type_aide=get_final_type(detail["prets"]),
        lien=f"https://aides-entreprises.fr/aide/{detail['id_aid']}",
        Source="https://aides-entreprises.fr",
    )

    return detail
75
+
76
# The page size is 400 so that a single request returns every record:
# pagination on this endpoint does not work.
take = 400
skip = 0

subventions = []

# Keep requesting pages until one comes back shorter than the page size.
while True:
    responses = request(take, skip)
    subventions.extend(responses)

    print(f"Got {len(responses)} subventions")
    if len(responses) >= take:
        skip += take
        continue
    break


# Swap each listing entry for its detailed record, in place.
for i, entry in enumerate(subventions):
    subventions[i] = getAide(entry)


with open('data/aides_entreprises.json', 'w', encoding='utf-8') as out:
    json.dump(subventions, out, ensure_ascii=False, indent=4)
data/get_les_aides.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+
4
import os

# HTTPS connection shared by every API call in this script.
conn = http.client.HTTPSConnection("api.les-aides.fr")
# All requests are GETs sent with an empty body.
payload = ''
# SECURITY: the fallback literal below is a credential committed to the
# repository. Set LES_AIDES_API_IDC instead, then rotate the key and remove
# the hard-coded value.
headers = {
    'X-IDC': os.environ.get('LES_AIDES_API_IDC', 'bcfac1828e5ef1b7cab084379a5f2a871e82ee7c'),
}

# NOTE(review): requestID is not referenced anywhere else in the visible
# code; confirm whether it is still needed.
requestID = 23994079
filieres = [289, 290]
# NOTE(review): this list is not read by the visible code; the requests use
# the pre-formatted domaines_section_* strings instead.
domaines = [893, 883, 877, 790, 793, 798, 802, 805, 862, 807, 810, 813, 816, 820, 818]
13
+
14
def request(filiere, domaines):
    """Search aids for one filiere/domain combination.

    Args:
        filiere: value of the ``filiere`` query parameter.
        domaines: value of the ``domaine`` query parameter, inserted into
            the URL exactly as given.

    Returns:
        The ``dispositifs`` list of the response, each entry tagged with
        the response's ``idr`` value under the key ``idr``.
    """
    conn.request("GET", f"/aides?ape=A&region=75&domaine={domaines}&filiere={filiere}", payload, headers)
    data = json.loads(conn.getresponse().read().decode("utf-8"))

    dispositifs = data['dispositifs']
    print(f"Total subventions : {len(dispositifs)}")

    # getAide reads 'idr' back from each entry to build its detail request.
    for dispositif in dispositifs:
        dispositif['idr'] = data['idr']

    return dispositifs
26
+
27
+
28
def get_final_type(types):
    """Placeholder: no type mapping is implemented for this source.

    Always returns ``None``, whatever *types* contains.
    """
    return None
32
+
33
+
34
def getAide(aide):
    """Fetch the detailed record of one dispositif and attach its metadata.

    Best-effort: any failure is printed and ``None`` is returned instead of
    the record.

    Args:
        aide: search result entry providing ``idr`` and ``numero``.

    Returns:
        The decoded detail record with a ``metadata`` dict added, or
        ``None`` if anything went wrong.
    """
    try:
        path = f"/aide/?requete={aide['idr']}&dispositif={aide['numero']}"
        conn.request("GET", path, payload, headers)
        response = conn.getresponse()
        # Rebinding `aide` is deliberate: the error report below prints
        # whichever record was current when the failure happened.
        aide = json.loads(response.read().decode("utf-8"))

        aide["metadata"] = dict(
            lien=aide['uri'],
            Source="https://les-aides.fr/",
        )
        return aide
    except Exception as err:
        print(aide)
        print(f"Error: {err}")
        return None
58
+
59
+
60
# Accumulates the de-duplicated search results, later replaced by details.
subventions = []


# The domain ids are queried in two batches. Each batch is rendered as a
# bracketed, comma-separated list with no spaces.
domaines_section_2 = "[" + ",".join(str(d) for d in (893, 883, 877, 790, 793, 798, 802)) + "]"
domaines_section_1 = "[" + ",".join(str(d) for d in (805, 862, 807, 810, 813, 816, 820, 818)) + "]"
66
+
67
+
68
+ # Function to add dispositifs to subventions list without duplication
69
def add_dispositifs(filiere, domaines_section):
    """Fetch one filiere/domain batch and append its unseen dispositifs.

    Results are appended to the module-level ``subventions`` list. An entry
    is skipped when a dict with the same ``numero`` is already in the list.

    Args:
        filiere: filiere id forwarded to ``request``.
        domaines_section: domain list string forwarded to ``request``.
    """
    print(f"Requesting filiere {filiere} and domaines {domaines_section}")

    # Index the numbers already collected once, instead of rescanning the
    # whole list for every candidate.
    seen = {
        subvention.get("numero")
        for subvention in subventions
        if isinstance(subvention, dict)
    }

    for dispositif in request(filiere, domaines_section):
        numero = dispositif["numero"]
        if numero not in seen:
            # Track additions so repeats within this batch are skipped too.
            seen.add(numero)
            subventions.append(dispositif)
76
+
77
+
78
+ # Call request function with different parameters
79
# Query each filiere against both domain batches, section 1 first.
for filiere in filieres:
    for section in (domaines_section_1, domaines_section_2):
        add_dispositifs(filiere, section)


print(f"Nb aides : {len(subventions)}")

# Swap each search result for its detailed record, in place, printing
# progress as "<numero> : <index> / <total>".
for i, dispositif in enumerate(subventions):
    print(f"{dispositif['numero']} : {i} / {len(subventions)}")
    subventions[i] = getAide(dispositif)


with open('data/les_aides.json', 'w', encoding='utf-8') as out:
    json.dump(subventions, out, ensure_ascii=False, indent=4)
data/importToDb.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import pandas as pd
5
+ import time
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
8
+ from langchain_pinecone import PineconeVectorStore
9
+ from langchain_core.documents import Document
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from pinecone import Pinecone, ServerlessSpec
12
+
13
+ from dotenv import load_dotenv
14
+
15
# Load settings from a .env file into the process environment.
load_dotenv()

index_name = os.getenv("PINECONE_INDEX_NAME")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

existing_indexes = []
for index_info in pc.list_indexes():
    existing_indexes.append(index_info["name"])

# Create the index on first use and block until it reports ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # presumably matches the embedding model's vector size — confirm
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)

print(index_name)
37
+
38
def get_text_chunks(text):
    """Split *text* into overlapping chunks and return them.

    The splitter is configured for chunks of 500 characters with a
    100-character overlap, with length measured by ``len``.
    """
    splitter = RecursiveCharacterTextSplitter(
        length_function=len,
        chunk_overlap=100,
        chunk_size=500,
    )
    return splitter.split_text(text)
46
+
47
+
48
# Per-source import rules, keyed by the JSON file name passed as `source`:
#   drop     -> top-level keys left out of the embedded text
#   id       -> key holding the record identifier
#   deadline -> key holding the deadline (None: always stored as -1)
#   prefix   -> prefix of the vector-store document id
_SOURCE_RULES = {
    "aides_entreprises.json": {
        "drop": ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "id": "id_aid",
        "deadline": "date_fin",
        "prefix": "entreprises",
    },
    "aides_territoires.json": {
        "drop": (),
        "id": "id",
        "deadline": "submission_deadline",
        "prefix": "territoires",
    },
    "cegara_sub.json": {
        "drop": ("support", "html"),
        "id": "id",
        "deadline": "validite_fin",
        "prefix": "cegara",
    },
    "les_aides.json": {
        "drop": ("cci", "url"),
        "id": "numero",
        "deadline": None,
        "prefix": "aides",
    },
}


def _prepare_document(subvention, source):
    """Return ``(content, metadata)`` for one record of a known source."""
    rules = _SOURCE_RULES[source]

    # Build a filtered copy so the caller's dict is left untouched.
    content = {
        key: value
        for key, value in subvention.items()
        if key not in rules["drop"]
    }

    deadline = subvention.get(rules["deadline"]) if rules["deadline"] else None
    record_id = subvention[rules["id"]]
    metadata = {
        **subvention.get("metadata", {}),
        "id_subvention": record_id,
        # -1 marks "no deadline" (field missing, null, or not provided).
        "deadline_date": deadline if deadline is not None else -1,
        "id_document": f"{rules['prefix']}_{record_id}",
    }
    return content, metadata


def importAideEntreprise(subvention, source):
    """Embed one aid record and store it in the Pinecone index.

    The record is serialised to JSON, without the bulky keys listed for its
    source, and stored as a single document whose id is
    ``<source prefix>_<record id>``. Import is best-effort: failures are
    printed with a traceback and never raised.

    Args:
        subvention: decoded JSON object for one aid.
        source: name of the JSON file the record came from. It selects the
            id/deadline keys and the keys to drop.

    Returns:
        None.
    """
    if source not in _SOURCE_RULES:
        # Previously surfaced as an UnboundLocalError on `metadata`.
        print(f"Skipping document: unknown source {source!r}")
        return
    if not isinstance(subvention, dict):
        # The fetch scripts can write null entries for records whose detail
        # request failed.
        print(f"Skipping document from {source}: expected an object, got {type(subvention).__name__}")
        return

    try:
        # NOTE(review): the embedding client and vector store are rebuilt
        # for every record; consider creating them once per run.
        embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=os.environ.get("MISTRAL_API_KEY"))
        vector_store = PineconeVectorStore(index=index, embedding=embedding)

        content, metadata = _prepare_document(subvention, source)

        # ensure_ascii=False keeps accented characters as real text instead
        # of \uXXXX escapes, so the embedded content is readable text.
        json_text = json.dumps(content, ensure_ascii=False, indent=4)
        document = Document(page_content=json_text, metadata=metadata)

        # The document id doubles as the vector id.
        uuid = metadata["id_document"]

        print("Before add_documents")
        vector_store.add_documents(documents=[document], ids=[uuid])

        print(f"Stored document with ID: {uuid} from source: {source}")
    except Exception as e:
        import traceback
        print(f"Error storing document: {e}")
        traceback.print_exc()
129
+
130
+
131
def loopSubventions(subventions,source):
    """Import every record of one source file, one at a time.

    Pauses half a second after each record.

    Args:
        subventions: iterable of decoded aid records.
        source: file name the records came from, forwarded unchanged.
    """
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(0.5)
135
+
136
+
137
def go():
    """Import every ``.json`` file found in the ``data`` directory.

    Each file is decoded and handed to ``loopSubventions`` together with
    its file name, which identifies the source format.
    """
    print("Importing Aide Entreprise data...")

    for file_name in os.listdir('data'):
        if not file_name.endswith(".json"):
            continue

        print(file_name)
        with open(os.path.join('data', file_name), "r", encoding="utf-8") as handle:
            data = json.load(handle)

        loopSubventions(data,source=file_name)


if __name__ == "__main__":
    go()