ia-subvention

Running

App Files Files Community

[email protected] committed 4 days ago

Commit

6236000

1 Parent(s): 18107fd

Add script to get API data and import to vector store

Browse files

Files changed (6) hide show

data/__init__.py +0 -0
data/aides_territoires.py +80 -0
data/get_aides_cegara.py +51 -0
data/get_aides_entreprises.py +100 -0
data/get_les_aides.py +93 -0
data/importToDb.py +151 -0

data/__init__.py ADDED Viewed

File without changes

data/aides_territoires.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import http.client
+import json
import os

# Single HTTPS connection reused by every call to the Aides-territoires API.
conn = http.client.HTTPSConnection("aides-territoires.beta.gouv.fr")

# Header sent to the login endpoint. The token can be supplied through the
# AIDES_TERRITOIRES_TOKEN environment variable.
# NOTE(review): the fallback below is a credential committed to the repository;
# it should be rotated and the fallback removed once the variable is deployed.
headersConnexion = {
    'X-AUTH-TOKEN': os.environ.get(
        'AIDES_TERRITOIRES_TOKEN',
        'eeb481e42950f1dbfc46dc348e6e32a0c631cc5b94dd7ab874a30c027f9de87c',
    ),
}
def connexion():
    """POST to /api/connexion/ and return the 'token' field of the JSON reply."""
    conn.request("POST", "/api/connexion/", '', headersConnexion)
    raw_body = conn.getresponse().read()
    reply = json.loads(raw_body.decode("utf-8"))
    return reply['token']
def get_final_type(types):
    """Return the category label of the first recognised aid type in `types`.

    `types` is iterated in order; the first entry found in the table below
    decides the result. Returns None when no entry is recognised.
    """
    loans = "Avance − Prêts − Garanties"
    grant = "Subvention"
    other = "Autres"
    label_by_type = {
        "Subvention": grant,
        "Prêt": loans,
        "Avance récupérable": loans,
        "Certificat d'économie d'énergie (CEE)": other,
        "Autre aide financière": other,
        "Ingénierie technique": other,
        "Ingénierie financière": other,
        "Ingénierie Juridique / administrative": other,
    }
    return next((label_by_type[t] for t in types if t in label_by_type), None)
def request():
    """Fetch every page of the filtered aid listing and return all results.

    Pages are requested one after another, starting at page 1, until the
    response's 'next' field is None. Uses the module-level `conn` and
    `headersRequest`, so `headersRequest` must be defined before calling.

    Returns:
        The concatenation of the 'results' lists of every page.
    """
    subventions = []
    page = 1
    while True:
        conn.request("GET", f"/api/aids/?page={page}&organization_type_slugs=farmer&perimeter_id=70971-nouvelle-aquitaine", '', headersRequest)
        data = json.loads(conn.getresponse().read().decode("utf-8"))
        subventions.extend(data['results'])
        if data['next'] is None:
            break
        page += 1
    # Report the accumulated total, not just the size of the last page.
    print(f"Total subventions : {len(subventions)}")
    return subventions
# Authenticate first: request() reads the module-level headersRequest.
token = connexion()
headersRequest = {
    'Authorization': 'Bearer ' + token,
}
aides = request()
print(f"Nb aides : {len(aides)}")
# Attach the normalised category and source links to every aid record.
for aide in aides:
    aide["metadata"] = {
        "type_aide": get_final_type(aide["aid_types"]),
        "lien": f"https://aides-territoires.beta.gouv.fr/aides/{aide['slug']}",
        "Source": "https://aides-territoires.beta.gouv.fr",
    }
# Write under data/, which is where the import script lists its *.json inputs
# (previously the file was written to the current working directory).
with open('data/aides_territoires.json', 'w', encoding='utf-8') as f:
    json.dump(aides, f, ensure_ascii=False, indent=4)

data/get_aides_cegara.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import json
+import requests
+from datetime import *
def get_final_type(types):
    """Translate a list of aid-type names into one category label.

    The first name in `types` that appears in the table wins; None is
    returned when none of them is known.
    """
    avance_prets = "Avance − Prêts − Garanties"
    autres = "Autres"
    categories = {
        "Subvention": "Subvention",
        "Prêt": avance_prets,
        "Avance récupérable": avance_prets,
        "Certificat d'économie d'énergie (CEE)": autres,
        "Autre aide financière": autres,
        "Ingénierie technique": autres,
        "Ingénierie financière": autres,
        "Ingénierie Juridique / administrative": autres,
    }
    for candidate in types:
        label = categories.get(candidate)
        if label is not None:
            return label
    return None
def request():
    """Download the Cegara aid list and return the decoded JSON payload.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get('https://api.groupecegara.fr/aidesliste', timeout=30)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()
aides = request()
print(f"Nb aides : {len(aides)}")
# Every record receives the same fixed category and the same page as both
# link and source.
publications_url = "https://www.groupecegara.fr/publications-flash.html"
for aide in aides:
    aide["metadata"] = {
        "type_aide": "Subvention",
        "lien": publications_url,
        "Source": publications_url,
    }
with open('data/cegara_sub.json', 'w', encoding='utf-8') as f:
    json.dump(aides, f, ensure_ascii=False, indent=4)

data/get_aides_entreprises.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import http.client
+import json
import os

# Single HTTPS connection reused by every call to the aides-entreprises API.
conn = http.client.HTTPSConnection("api.aides-entreprises.fr")
# Empty request body passed with every GET.
payload = ''
# API credentials; they can be supplied through the AIDES_ENTREPRISES_ID and
# AIDES_ENTREPRISES_KEY environment variables.
# NOTE(review): the fallbacks below are credentials committed to the
# repository; they should be rotated and the fallbacks removed.
headers = {
    'X-Aidesentreprises-Id': os.environ.get('AIDES_ENTREPRISES_ID', 'jb4nMj67'),
    'X-Aidesentreprises-Key': os.environ.get('AIDES_ENTREPRISES_KEY', 'waMF2TjO'),
}
def request(limit=20, offset=0):
    """Fetch one slice of the aid listing and return the reply's 'data' field.

    `limit` and `offset` are forwarded as query parameters of the same name.
    """
    print(f"Requesting {limit} subventions from offset {offset}")
    path = f"/v1.1/aides?profils=20&territoire=50002&limit={limit}&offset={offset}"
    conn.request("GET", path, payload, headers)
    body = conn.getresponse().read().decode("utf-8")
    return json.loads(body)['data']
def get_final_type(types):
    """Return the category label for the first recognised entry of `types`.

    Each entry must be a mapping with an "id_typ" key; entries are checked in
    order and the first known id decides the result. Returns None when no id
    is recognised.
    """
    ids_by_label = {
        # 15: honor loan, 5: loan, 4: repayable advance, 7: guarantee
        "Avance − Prêts − Garanties": ("15", "5", "4", "7"),
        # 3: grant
        "Subvention": ("3",),
        # 16: social-charge exemption, 8: tax relief
        "Prise en charge des coûts et allègement des charges": ("16", "8"),
        # 12: prize, 6: interest subsidy, 9: equity participation,
        # 14: call for projects, 11: leasing
        "Autres": ("12", "6", "9", "14", "11"),
    }
    label_by_id = {
        type_id: label
        for label, type_ids in ids_by_label.items()
        for type_id in type_ids
    }
    for entry in types:
        type_id = entry["id_typ"]
        if type_id in label_by_id:
            return label_by_id[type_id]
    return None
def getAide(aide):
    """Fetch the detailed record for a listed aid and attach its metadata.

    The detail endpoint is queried with the listing entry's 'id_aid'; the
    first element of the JSON reply replaces the listing entry. The record's
    "prets" value is printed and used to derive "type_aide".
    """
    detail_path = f"/v1.1/aides/{aide['id_aid']}"
    conn.request("GET", detail_path, payload, headers)
    raw = conn.getresponse().read()
    aide = json.loads(raw.decode("utf-8"))[0]
    prets = aide["prets"]
    print(prets)
    aide["metadata"] = {
        "type_aide": get_final_type(prets),
        "lien": f"https://aides-entreprises.fr/aide/{aide['id_aid']}",
        "Source": "https://aides-entreprises.fr",
    }
    return aide
# The page size is 400 so that one request is expected to return everything;
# the original author notes that pagination does not work on this API.
take = 400
skip = 0
subventions = []
while True:
    responses = request(take, skip)
    subventions.extend(responses)
    print(f"Got {len(responses)} subventions")
    # A short page means the listing is exhausted.
    if len(responses) < take:
        break
    skip += take
# Replace every listing entry by its detailed record.
subventions = [getAide(listed) for listed in subventions]
with open('data/aides_entreprises.json', 'w', encoding='utf-8') as f:
    json.dump(subventions, f, ensure_ascii=False, indent=4)

data/get_les_aides.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import http.client
+import json
import os

# Single HTTPS connection reused by every call to the les-aides.fr API.
conn = http.client.HTTPSConnection("api.les-aides.fr")
# Empty request body passed with every GET.
payload = ''
# API key header; it can be supplied through the LES_AIDES_IDC environment
# variable.
# NOTE(review): the fallback below is a credential committed to the
# repository; it should be rotated and the fallback removed.
headers = {
    'X-IDC': os.environ.get('LES_AIDES_IDC', 'bcfac1828e5ef1b7cab084379a5f2a871e82ee7c'),
}
requestID = 23994079
# "filiere" and "domaine" identifiers used as query parameters of /aides.
filieres = [289, 290]
domaines = [893, 883, 877, 790, 793, 798, 802, 805, 862, 807, 810, 813, 816, 820, 818]
def request(filiere, domaines):
    """Query /aides for one filiere and domain list; return its 'dispositifs'.

    `filiere` and `domaines` are interpolated as-is into the query string.
    Every returned dispositif is tagged with the reply's 'idr' value, which
    getAide later sends back when fetching the detail.
    """
    conn.request("GET", f"/aides?ape=A&region=75&domaine={domaines}&filiere={filiere}", payload, headers)
    data = json.loads(conn.getresponse().read().decode("utf-8"))
    dispositifs = data['dispositifs']
    print(f"Total subventions : {len(dispositifs)}")
    for dispositif in dispositifs:
        dispositif['idr'] = data['idr']
    return dispositifs
def get_final_type(types):
    """Placeholder: no type mapping exists for this source, so always None."""
    del types  # intentionally unused
    return None
def getAide(aide):
    """Fetch the detailed record of a dispositif and attach its metadata.

    The detail endpoint is queried with the entry's 'idr' and 'numero'. The
    decoded reply replaces the entry and gets a "metadata" dict built from its
    'uri'. On any error the current value of `aide` and the error are printed
    and None is returned.
    """
    try:
        detail_path = f"/aide/?requete={aide['idr']}&dispositif={aide['numero']}"
        conn.request("GET", detail_path, payload, headers)
        raw = conn.getresponse().read()
        aide = json.loads(raw.decode("utf-8"))
        aide["metadata"] = {
            "lien": aide['uri'],
            "Source": "https://les-aides.fr/",
        }
    except Exception as err:
        print(aide)
        print(f"Error: {err}")
        return None
    else:
        return aide
# Accumulates the de-duplicated dispositifs returned by every query.
subventions = []
# The domain list is split into two query strings, one request per section.
domaines_section_2 = "[893,883,877,790,793,798,802]"
domaines_section_1 = "[805,862,807,810,813,816,820,818]"


def add_dispositifs(filiere, domaines_section):
    """Query one filiere/domain section and append unseen dispositifs.

    A dispositif is skipped when an entry with the same "numero" is already
    present in the module-level `subventions` list.
    """
    print(f"Requesting filiere {filiere} and domaines {domaines_section}")
    dispositifs = request(filiere, domaines_section)
    # Build the set of known numbers once instead of rescanning the whole
    # list for every incoming dispositif.
    known_numeros = {
        subvention.get("numero")
        for subvention in subventions
        if isinstance(subvention, dict)
    }
    for dispositif in dispositifs:
        numero = dispositif["numero"]
        if numero not in known_numeros:
            known_numeros.add(numero)
            subventions.append(dispositif)
# Query both domain sections for every filiere.
for filiere in filieres:
    add_dispositifs(filiere, domaines_section_1)
    add_dispositifs(filiere, domaines_section_2)
print(f"Nb aides : {len(subventions)}")
# Replace every listing entry by its detailed record. getAide returns None
# when a fetch fails; those entries are dropped so the output file does not
# contain null records.
total = len(subventions)
details = []
for i, dispositif in enumerate(subventions):
    print(f"{dispositif['numero']} : {i} / {total}")
    detail = getAide(dispositif)
    if detail is not None:
        details.append(detail)
subventions = details
with open('data/les_aides.json', 'w', encoding='utf-8') as f:
    json.dump(subventions, f, ensure_ascii=False, indent=4)

data/importToDb.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import os
+import json
+import time
+import pandas as pd
+import time
+from langchain_openai import OpenAIEmbeddings
+from langchain_mistralai.embeddings import MistralAIEmbeddings
+from langchain_pinecone import PineconeVectorStore
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from pinecone import Pinecone, ServerlessSpec
+from dotenv import load_dotenv
# Load variables from a .env file before reading the configuration below.
load_dotenv()
index_name = os.getenv("PINECONE_INDEX_NAME")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
# Create the index on first run, then poll once per second until it is ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # presumably the output size of the embedding model used for import -- TODO confirm
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)
index = pc.Index(index_name)
print(index_name)
def get_text_chunks(text):
    """Split `text` into chunks of at most 500 characters, overlapping by 100.

    Lengths are measured with the built-in len(), i.e. in characters.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
# Import rules per source file name (the name go() passes as `source`):
#   drop         - fields removed from the record before it is embedded
#   id_key       - record field holding the aid identifier
#   deadline_key - record field holding the deadline (None: no such field)
#   prefix       - prefix of the document id stored in the vector store
_SOURCE_RULES = {
    "aides_entreprises.json": {
        "drop": ("contacts", "contact", "profils", "projets", "cache_indexation"),
        "id_key": "id_aid",
        "deadline_key": "date_fin",
        "prefix": "entreprises",
    },
    "aides_territoires.json": {
        "drop": (),
        "id_key": "id",
        "deadline_key": "submission_deadline",
        "prefix": "territoires",
    },
    "cegara_sub.json": {
        "drop": ("support", "html"),
        "id_key": "id",
        "deadline_key": "validite_fin",
        "prefix": "cegara",
    },
    "les_aides.json": {
        "drop": ("cci", "url"),
        "id_key": "numero",
        "deadline_key": None,
        "prefix": "aides",
    },
}

# Lazily created vector store shared by every import call.
_vector_store = None


def _get_vector_store():
    """Build the Mistral-backed Pinecone vector store once and reuse it."""
    global _vector_store
    if _vector_store is None:
        embedding = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=os.environ.get("MISTRAL_API_KEY"),
        )
        _vector_store = PineconeVectorStore(index=index, embedding=embedding)
    return _vector_store


def _build_metadata(subvention, rules):
    """Strip the dropped fields from `subvention` in place; return its metadata."""
    for field in rules["drop"]:
        subvention.pop(field, None)
    deadline_key = rules["deadline_key"]
    deadline = subvention.get(deadline_key) if deadline_key else None
    aid_id = subvention[rules["id_key"]]
    metadata = {
        **(subvention.get("metadata") or {}),
        "id_subvention": aid_id,
        # -1 marks a missing or empty deadline.
        "deadline_date": deadline if deadline is not None else -1,
        "id_document": f"{rules['prefix']}_{aid_id}",
    }
    # Drop None values (e.g. "type_aide" when no type was recognised) rather
    # than sending them to the vector store as null metadata.
    return {key: value for key, value in metadata.items() if value is not None}


def importAideEntreprise(subvention, source):
    """Embed one aid record and store it in the Pinecone index.

    Args:
        subvention: the aid record (a dict). Source-specific bulky fields are
            removed from it in place before it is serialised.
        source: name of the JSON file the record came from; it selects the
            id/deadline fields and the document-id prefix. Records from an
            unknown file name are skipped with a message.

    Returns:
        None. Failures are printed with a traceback and do not propagate, so
        one bad record does not stop the whole import.
    """
    rules = _SOURCE_RULES.get(source)
    if rules is None:
        print(f"Skipping document from unsupported source: {source}")
        return
    try:
        metadata = _build_metadata(subvention, rules)
        # Keep accented characters readable in the embedded text instead of
        # escaping them as \uXXXX sequences.
        json_text = json.dumps(subvention, indent=4, ensure_ascii=False)
        document = Document(page_content=json_text, metadata=metadata)
        # The id is deterministic, so re-importing a record reuses the same id.
        document_id = metadata["id_document"]
        print("Before add_documents")
        _get_vector_store().add_documents(documents=[document], ids=[document_id])
        print(f"Stored document with ID: {document_id} from source: {source}")
    except Exception as e:
        import traceback
        print(f"Error storing document: {e}")
        traceback.print_exc()
def loopSubventions(subventions,source):
    """Import every record of `subventions`, pausing 0.5 s after each one."""
    pause_seconds = 0.5
    for record in subventions:
        importAideEntreprise(record, source)
        time.sleep(pause_seconds)
def go():
    """Import every *.json file found in the 'data' directory.

    The file name is passed along as the `source` of each record.
    """
    print("Importing Aide Entreprise data...")
    data_dir = 'data'
    for file_name in os.listdir(data_dir):
        if not file_name.endswith(".json"):
            continue
        print(file_name)
        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f:
            loopSubventions(json.load(f), source=file_name)


if __name__ == "__main__":
    go()