Spaces:

ppaihack
/

ZamaClinik

Sleeping

File size: 6,196 Bytes

b65ff94

import requests
from PIL import Image
import pytesseract
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
import re
import json

# Hugging Face access token, read from the "HFBearer" environment variable and
# re-exported under HUGGINGFACEHUB_API_TOKEN.
# NOTE(review): presumably this is the variable the LangChain Hugging Face
# client reads for authentication — confirm against the library docs.
api_key = os.environ.get("HFBearer")
if api_key is not None:
    # os.environ only accepts str values: assigning None raises TypeError at
    # import time, so the token is exported only when one is actually set.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key

# URL of the Hugging Face inference endpoint used for the LLM calls
API_URL = "https://pllfc7e5i0rujahy.us-east-1.aws.endpoints.huggingface.cloud"

def extract_text_from_image(image):
    """Run Tesseract OCR on *image* and return the recognised text.

    The image is handed unchanged to ``pytesseract.image_to_string`` and the
    OCR output is returned as-is, with no post-processing.
    """
    return pytesseract.image_to_string(image)

def extract_json(text):
    """Decode the JSON payload enclosed in the first <JSON>...</JSON> pair.

    Whitespace directly inside the tags is ignored and the payload may span
    several lines.

    Returns:
        The decoded JSON value on success. Failures are reported as a French
        message string rather than an exception: "Aucun JSON trouvé" when no
        tag pair is present, "Erreur de décodage JSON" when the enclosed text
        is not valid JSON.
    """
    found = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if found is None:
        return "Aucun JSON trouvé"

    payload = found.group(1)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return "Erreur de décodage JSON"

def get_image_metadata(image):
    """Derive the document title of *image* from its file name.

    The title is the base file name up to its first dot, e.g. "ECG.png"
    gives "ECG". Any directory part of ``image.name`` is dropped first, so a
    full path such as "/tmp/upload.1/ECG.png" still yields "ECG" instead of
    a truncated path.

    Args:
        image: object exposing the file name (or path) as a ``name``
            attribute.

    Returns:
        The title string.
    """
    # Strip directories before splitting: a dot in a directory name would
    # otherwise cut the title short, and a path never matches a title key.
    file_name = os.path.basename(image.name)
    return file_name.split('.')[0]

def count_tokens(text):
    """Return the number of whitespace-separated words in *text*.

    This is a plain word count standing in for a token count; no tokenizer
    is involved.
    """
    words = text.split()
    return len(words)

# Maps a document title (the file-name stem returned by get_image_metadata)
# to the comma-separated list of parameter names inserted into the LLM prompt.
# NOTE(review): "score calcique" (echographie-poumons) uses a space where
# bilan-atherosclerose uses "score_calcique" — confirm which spelling the
# downstream consumers expect before normalising.
image_params = {
    "bilan-atherosclerose": "medecin_responsable, rythme_sinusal, valeur_EIM, score_calcique",
    "bilan-medical": "medecin_responsable, date_naissance, prenom, nom, identifiant_patient, nom_medecin",
    # The stray ")" after ECG_repos_valeur_par_minute was removed so it no
    # longer leaks into the prompt as part of the parameter name.
    "ECG": "medecin_responsable, poids, taille, ECG_repos_valeur_par_minute, valeur_FMT, valeur_niveau_atteint, valeur_diminution_frequence_cardiaque_bpm",
    "echo-doppler": "medecin_responsable, sous_clavieres, vertebrales, carotides",
    "echographie-poumons": "medecin_responsable, score calcique, technique, resultats",
    "echotomographie-abdominale": "medecin_responsable, foie, vesicule, pancreas, reins, rate, aorte_abdominale, conclusion",
    "echotomographie-cardiaque": "medecin_responsable, taille, poids, surface_corporelle, conclusion",
    "echotomographie-prostate": "medecin_responsable, vessie, ureteres, prostate, conclusion",
    "hematologie": "medecin_responsable, leucocytes, hematies, hemoglobines, hematocrite",
}

# Module-level prompt template (French) asking the model to return the
# requested parameters as a JSON object wrapped in <JSON>...</JSON> tags.
# {parameters} and {texte} are the template placeholders; the doubled braces
# around the example object are presumably escapes for literal braces in the
# template syntax — confirm against PromptTemplate's format rules.
# NOTE(review): in the visible code this module-level template is only
# referenced by the commented-out line below; extract_json_from_images
# defines its own local `user_input`. Confirm it is unused before removing.
user_input = """
Vous allez extraire des paramètres d'un texte à l'intérieur d'un objet JSON, écrit entre <JSON> et </JSON>.
Liste des paramètres : {parameters}

Voici un exemple de réponse valide :
<JSON>
{{"date_naissance": "", "prenom": "", "nom": ""}}
</JSON>

Voici le texte à partir duquel vous devez extraire les paramètres :
{texte}
"""

# prompt = PromptTemplate.from_template(user_input)

# LLM client bound to the endpoint at API_URL; created once at import time
# and shared by every call to extract_json_from_images.
llm = HuggingFaceEndpoint(
    endpoint_url=API_URL,
)

# llm_chain = prompt | llm

# # File uploader for multiple images
# uploaded_images = st.file_uploader("Upload images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

# # Modify the Streamlit section to extract the JSON for multiple images
# if st.button("Submit"):
#     if uploaded_images:
#         all_json_data = {}  # Dictionary to store JSON data for each image
#         for uploaded_image in uploaded_images:
#             with st.spinner(f"Extracting text from image: {uploaded_image.name}..."):
#                 image = Image.open(uploaded_image)
#                 extracted_text = extract_text_from_image(image)

#                 max_text_length = 500  # Adjust as needed to keep total tokens under 1024
#                 if count_tokens(extracted_text) > max_text_length:
#                     extracted_text = " ".join(extracted_text.split()[:max_text_length])

#                 with st.spinner(f"Fetching response from API for {uploaded_image.name}..."):
#                     # Get metadata title from the image
#                     title = get_image_metadata(uploaded_image)
#                     parameters = image_params[title]
#                     output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})     
#                     st.success(f"Response received for {uploaded_image.name}!")

#                     # Extract JSON from the API output
#                     json_data = extract_json(output)  # Extract JSON from the API output
#                     all_json_data[title] = json_data  # Store JSON data with title as key
#                     st.write(title, json_data)
        
#         # Display all extracted JSON data
#         st.write("Extracted JSON Data for all images.")
#     else:
#         st.warning("Please upload at least one image to extract text.")



def extract_json_from_images(uploaded_images):
    """Run OCR and LLM parameter extraction on each uploaded image.

    For every image the text is read with OCR, truncated to at most 500
    whitespace-separated words, and sent to the LLM together with the
    parameter list registered in ``image_params`` for the image's title.
    The <JSON>...</JSON> block of the answer is then parsed.

    Args:
        uploaded_images: iterable of uploaded files that ``Image.open``
            accepts and that expose a ``name`` attribute. ``None`` or an
            empty collection yields an empty result.

    Returns:
        dict mapping each image title to the value returned by
        ``extract_json`` (the decoded JSON, or its error-message string).
        Images that share a title overwrite one another.

    Raises:
        KeyError: if an image title has no entry in ``image_params``.
    """
    # Nothing uploaded (None or empty): return early instead of failing on
    # iteration or building an unused chain.
    if not uploaded_images:
        return {}

    max_text_length = 500  # Adjust as needed to keep total tokens under 1024

    # The template and chain do not depend on the image, so they are built
    # once per call rather than once per image. The literal keeps its
    # original leading whitespace so the prompt sent to the model is unchanged.
    template = """
        Vous allez extraire des paramètres d'un texte à l'intérieur d'un objet JSON, écrit entre <JSON> et </JSON>.
        Liste des paramètres : {parameters}

        Voici un exemple de réponse valide :
        <JSON>
        {{"date_naissance": "", "prenom": "", "nom": ""}}
        </JSON>

        Voici le texte à partir duquel vous devez extraire les paramètres :
        {texte}
        """
    llm_chain = PromptTemplate.from_template(template) | llm

    all_json_data = {}  # title -> extracted JSON data (or error message)
    for uploaded_image in uploaded_images:
        # The context manager releases the image's resources as soon as the
        # OCR pass is done.
        with Image.open(uploaded_image) as image:
            extracted_text = extract_text_from_image(image)

        # Split once and reuse the words for both the length check and the
        # truncation.
        words = extracted_text.split()
        if len(words) > max_text_length:
            extracted_text = " ".join(words[:max_text_length])

        # The title (file-name stem) selects which parameters to ask for.
        title = get_image_metadata(uploaded_image)
        parameters = image_params[title]

        output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})

        # Parse the <JSON> block of the answer and store it under the title.
        all_json_data[title] = extract_json(output)

    return all_json_data