"""Extract structured medical-report parameters from scanned images.

Each image is OCR'd with Tesseract, the recognised text is sent to a
Hugging Face inference endpoint together with the parameter list registered
for that report type, and the JSON object found between ``<JSON>`` and
``</JSON>`` in the reply is returned.
"""

import json
import os
import re

import pytesseract
import requests  # noqa: F401  (kept: may be used by other parts of the project)
from PIL import Image
from langchain.chains import LLMChain  # noqa: F401  (kept for compatibility)
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

# The endpoint token is read from the HFBearer environment variable.
# os.environ only accepts strings, so the value is copied only when it is
# actually set; an unconditional assignment of None raises TypeError at import.
api_key = os.environ.get("HFBearer")
if api_key is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key

# URL of the dedicated Hugging Face inference endpoint.
API_URL = "https://pllfc7e5i0rujahy.us-east-1.aws.endpoints.huggingface.cloud"

# Matches the payload the model is asked to write between <JSON> and </JSON>.
# DOTALL lets the payload span several lines.
_JSON_BLOCK_RE = re.compile(r'<JSON>\s*(.*?)\s*</JSON>', re.DOTALL)


def extract_text_from_image(image):
    """Return the text Tesseract recognises in *image*."""
    return pytesseract.image_to_string(image)


def extract_json(text):
    """Parse the JSON payload enclosed in ``<JSON>...</JSON>`` within *text*.

    Returns:
        The decoded JSON value when a well-formed payload is found.
        Otherwise one of two French status strings, which callers receive
        in place of the data: ``"Erreur de décodage JSON"`` when the payload
        is not valid JSON, ``"Aucun JSON trouvé"`` when no tagged payload
        is present.
    """
    match = _JSON_BLOCK_RE.search(text)
    if match is None:
        return "Aucun JSON trouvé"
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return "Erreur de décodage JSON"


def get_image_metadata(image):
    """Return the report title derived from the file name of *image*.

    *image* must expose a ``name`` attribute. The title is the part of the
    file name before the first dot. Any directory component is stripped
    first, so ``"./ECG.png"`` or ``"scans/ECG.png"`` both yield ``"ECG"``
    instead of an empty or path-prefixed title.
    """
    file_name = os.path.basename(image.name)
    return file_name.split('.')[0]


def count_tokens(text):
    """Return the number of whitespace-separated words in *text*.

    This is a rough word count, not a model tokenizer count.
    """
    return len(text.split())


# Parameters to extract for each report type, keyed by the image title
# (file name without extension).
image_params = {
    "bilan-atherosclerose": "medecin_responsable, rythme_sinusal, valeur_EIM, score_calcique",
    "bilan-medical": "medecin_responsable, date_naissance, prenom, nom, identifiant_patient, nom_medecin",
    # A stray ")" after ECG_repos_valeur_par_minute was removed: it leaked
    # into the prompt as part of the parameter name.
    "ECG": "medecin_responsable, poids, taille, ECG_repos_valeur_par_minute, valeur_FMT, valeur_niveau_atteint, valeur_diminution_frequence_cardiaque_bpm",
    "echo-doppler": "medecin_responsable, sous_clavieres, vertebrales, carotides",
    # NOTE(review): "score calcique" is written with a space here but as
    # "score_calcique" under "bilan-atherosclerose" - confirm which is intended.
    "echographie-poumons": "medecin_responsable, score calcique, technique, resultats",
    "echotomographie-abdominale": "medecin_responsable, foie, vesicule, pancreas, reins, rate, aorte_abdominale, conclusion",
    "echotomographie-cardiaque": "medecin_responsable, taille, poids, surface_corporelle, conclusion",
    "echotomographie-prostate": "medecin_responsable, vessie, ureteres, prostate, conclusion",
    "hematologie": "medecin_responsable, leucocytes, hematies, hemoglobines, hematocrite",
}

# Prompt template sent to the model; {parameters} and {texte} are filled in
# per image. Doubled braces are literal braces in the rendered prompt.
user_input = """
Vous allez extraire des paramètres d'un texte à l'intérieur d'un objet JSON, écrit entre <JSON> et </JSON>.
Liste des paramètres : {parameters}

Voici un exemple de réponse valide :
<JSON>
{{"date_naissance": "", "prenom": "", "nom": ""}}
</JSON>

Voici le texte à partir duquel vous devez extraire les paramètres :
{texte}
"""

llm = HuggingFaceEndpoint(
    endpoint_url=API_URL,
)

# NOTE: an earlier Streamlit front-end (file uploader + "Submit" button) ran
# this same pipeline inline; that logic now lives in extract_json_from_images().


def _truncate_words(text, max_words):
    """Return *text* cut down to at most *max_words* whitespace-separated words.

    Text that is already short enough is returned unchanged (original
    whitespace preserved); longer text is re-joined with single spaces.
    """
    words = text.split()
    if len(words) > max_words:
        return " ".join(words[:max_words])
    return text


def extract_json_from_images(uploaded_images, max_text_length=500):
    """Run OCR + LLM extraction on each image and collect the parsed JSON.

    Args:
        uploaded_images: Iterable of image files accepted by ``Image.open``
            that also expose a ``name`` attribute. ``None`` or an empty
            iterable yields an empty result.
        max_text_length: Maximum number of words of OCR text sent to the
            model for one image; longer text is truncated to keep the
            prompt short.

    Returns:
        A dict mapping each image title to the value returned by
        ``extract_json`` for that image. Images sharing a title overwrite
        one another; the last one wins.

    Raises:
        KeyError: If an image title has no entry in ``image_params``.
    """
    # The prompt and chain do not depend on the image, so build them once.
    prompt = PromptTemplate.from_template(user_input)
    llm_chain = prompt | llm

    all_json_data = {}
    for uploaded_image in uploaded_images or ():
        # Resolve the report type first so an unknown title fails before
        # the (comparatively slow) OCR step.
        title = get_image_metadata(uploaded_image)
        parameters = image_params[title]

        # Release the image's resources as soon as the text has been extracted.
        with Image.open(uploaded_image) as image:
            extracted_text = extract_text_from_image(image)
        extracted_text = _truncate_words(extracted_text, max_text_length)

        output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})
        all_json_data[title] = extract_json(output)

    return all_json_data