File size: 6,196 Bytes
b65ff94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import requests
from PIL import Image
import pytesseract
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
import re
import json

# Hugging Face access token, read from the HFBearer environment variable.
api_key = os.environ.get("HFBearer")
# os.environ only accepts str values: assigning None (variable unset) would
# raise TypeError at import time, so export the token only when it exists.
if api_key is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key

# URL of the Hugging Face inference endpoint queried by the LLM client
API_URL = "https://pllfc7e5i0rujahy.us-east-1.aws.endpoints.huggingface.cloud"

# Function to extract text from image
def extract_text_from_image(image):
    """Run OCR on *image* and return the recognised text.

    Delegates to ``pytesseract.image_to_string`` with its default settings.
    """
    return pytesseract.image_to_string(image)

# Function to extract JSON from text
def extract_json(text):
    """Decode the JSON payload wrapped in ``<JSON>`` ... ``</JSON>`` in *text*.

    Only the first tagged section is considered; whitespace just inside the
    tags is ignored and the payload may span several lines.

    Returns:
        The decoded JSON value on success, the string
        "Erreur de décodage JSON" when the payload is not valid JSON, or the
        string "Aucun JSON trouvé" when no tagged section is present.
    """
    tagged = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is None:
        return "Aucun JSON trouvé"

    payload = tagged.group(1)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return "Erreur de décodage JSON"

# Function to get metadata title from image
def get_image_metadata(image):
    """Return the title of an uploaded image, derived from its file name.

    The title is the part of the file name before its first ``.``.

    Args:
        image: Object exposing a ``name`` attribute holding the file name
            (or a path to the file).

    Returns:
        The file name stripped of any directory components and of everything
        from the first dot onwards.
    """
    # ``name`` may carry directory components (e.g. a temporary upload path);
    # drop them so a dot or separator in a parent directory cannot leak into
    # the title.
    file_name = os.path.basename(image.name)
    return file_name.split('.')[0]

def count_tokens(text):
    """Return the number of whitespace-separated words in *text*.

    This is a plain word count, used as a rough stand-in for a token count.
    """
    words = text.split()
    return len(words)

# Comma-separated names of the parameters to extract, keyed by document title.
# The title is derived from the uploaded file name, and the string is inserted
# verbatim into the LLM prompt as the list of parameters.
image_params = {
    "bilan-atherosclerose": "medecin_responsable, rythme_sinusal, valeur_EIM, score_calcique",
    "bilan-medical": "medecin_responsable, date_naissance, prenom, nom, identifiant_patient, nom_medecin",
    # A stray ")" after ECG_repos_valeur_par_minute was removed: it was being
    # sent to the model as part of the parameter name.
    "ECG": "medecin_responsable, poids, taille, ECG_repos_valeur_par_minute, valeur_FMT, valeur_niveau_atteint, valeur_diminution_frequence_cardiaque_bpm",
    "echo-doppler": "medecin_responsable, sous_clavieres, vertebrales, carotides",
    # NOTE(review): "score calcique" is written with a space here but as
    # "score_calcique" under "bilan-atherosclerose" - confirm which spelling
    # downstream consumers expect before normalising.
    "echographie-poumons": "medecin_responsable, score calcique, technique, resultats",
    "echotomographie-abdominale": "medecin_responsable, foie, vesicule, pancreas, reins, rate, aorte_abdominale, conclusion",
    "echotomographie-cardiaque": "medecin_responsable, taille, poids, surface_corporelle, conclusion",
    "echotomographie-prostate": "medecin_responsable, vessie, ureteres, prostate, conclusion",
    "hematologie": "medecin_responsable, leucocytes, hematies, hemoglobines, hematocrite",
}

# Prompt template (in French) instructing the model to return the requested
# parameters as a JSON object written between <JSON> and </JSON>.
# Placeholders: {parameters} (list of parameter names) and {texte} (the text
# to extract from). The doubled braces in the example are escaped so they
# come out as literal braces when the template is formatted.
# NOTE(review): the only module-level use visible here is commented out, and
# extract_json_from_images assigns its own local `user_input` - confirm
# whether this constant is still needed.
user_input = """
Vous allez extraire des paramètres d'un texte à l'intérieur d'un objet JSON, écrit entre <JSON> et </JSON>.
Liste des paramètres : {parameters}

Voici un exemple de réponse valide :
<JSON>
{{"date_naissance": "", "prenom": "", "nom": ""}}
</JSON>

Voici le texte à partir duquel vous devez extraire les paramètres :
{texte}
"""

# prompt = PromptTemplate.from_template(user_input)

# LLM client bound to the inference endpoint declared in API_URL
llm = HuggingFaceEndpoint(endpoint_url=API_URL)

# llm_chain = prompt | llm

# # File uploader for multiple images
# uploaded_images = st.file_uploader("Upload images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

# # Modify the Streamlit section to extract the JSON for multiple images
# if st.button("Submit"):
#     if uploaded_images:
#         all_json_data = {}  # Dictionary to store JSON data for each image
#         for uploaded_image in uploaded_images:
#             with st.spinner(f"Extracting text from image: {uploaded_image.name}..."):
#                 image = Image.open(uploaded_image)
#                 extracted_text = extract_text_from_image(image)

#                 max_text_length = 500  # Adjust as needed to keep total tokens under 1024
#                 if count_tokens(extracted_text) > max_text_length:
#                     extracted_text = " ".join(extracted_text.split()[:max_text_length])

#                 with st.spinner(f"Fetching response from API for {uploaded_image.name}..."):
#                     # Get metadata title from the image
#                     title = get_image_metadata(uploaded_image)
#                     parameters = image_params[title]
#                     output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})     
#                     st.success(f"Response received for {uploaded_image.name}!")

#                     # Extract JSON from the API output
#                     json_data = extract_json(output)  # Extract JSON from the API output
#                     all_json_data[title] = json_data  # Store JSON data with title as key
#                     st.write(title, json_data)
        
#         # Display all extracted JSON data
#         st.write("Extracted JSON Data for all images.")
#     else:
#         st.warning("Please upload at least one image to extract text.")



def extract_json_from_images(uploaded_images, max_text_length=500):
    """Extract the configured parameters from each uploaded image via the LLM.

    For every image the function runs OCR, truncates the text to
    ``max_text_length`` words, looks up the parameter list for the image's
    title in ``image_params``, prompts the LLM and parses the ``<JSON>``
    section of its answer with ``extract_json``.

    Args:
        uploaded_images: Iterable of objects that ``Image.open`` accepts and
            that expose a ``name`` attribute (used to derive the title).
        max_text_length: Maximum number of whitespace-separated words of OCR
            text sent to the model. The default of 500 is the value that was
            previously hard-coded, chosen to keep the total under 1024 tokens.

    Returns:
        A dict mapping each image title to the value returned by
        ``extract_json`` for that image (decoded JSON, or one of its error
        strings). Images that share a title overwrite each other.

    Raises:
        KeyError: If an image's title has no entry in ``image_params``.
    """
    # Neither the template nor the chain depends on the image, so build them
    # once instead of on every iteration. The module-level template is reused:
    # a triple-quoted copy written inside this function would carry the
    # function's indentation into every line of the prompt.
    llm_chain = PromptTemplate.from_template(user_input) | llm

    all_json_data = {}  # title -> extracted JSON (or error string)

    for uploaded_image in uploaded_images:
        # Release the image's resources as soon as OCR is done.
        with Image.open(uploaded_image) as image:
            extracted_text = extract_text_from_image(image)

        # Split once; text that is already short enough is sent unchanged.
        words = extracted_text.split()
        if len(words) > max_text_length:
            extracted_text = " ".join(words[:max_text_length])

        # The title selects which parameters the model is asked to extract.
        title = get_image_metadata(uploaded_image)
        parameters = image_params[title]

        output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})

        all_json_data[title] = extract_json(output)

    return all_json_data