Spaces:
Sleeping
Sleeping
File size: 6,196 Bytes
b65ff94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import requests
from PIL import Image
import pytesseract
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
import re
import json
# Hugging Face access token, read from the "HFBearer" environment variable.
api_key = os.environ.get("HFBearer")
# os.environ only accepts string values: assigning None (variable unset) would
# raise TypeError at import time, so the token is exported only when present.
if api_key is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
# URL of the Hugging Face inference endpoint queried by the LLM client.
API_URL = "https://pllfc7e5i0rujahy.us-east-1.aws.endpoints.huggingface.cloud"
# OCR helper
def extract_text_from_image(image):
    """Return the text that pytesseract recognises in ``image``.

    ``image`` is handed unchanged to ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image)
# Parse the <JSON>...</JSON> section of a model response
def extract_json(text):
    """Decode the first ``<JSON>...</JSON>`` section found in ``text``.

    Returns the decoded JSON value when the tagged section is valid JSON.
    Otherwise returns a French status string: ``"Aucun JSON trouvé"`` when no
    tagged section exists, or ``"Erreur de décodage JSON"`` when its content
    cannot be decoded.
    """
    # Non-greedy capture so only the first tagged section is used; DOTALL lets
    # the payload span several lines.
    tagged = re.search(r'<JSON>\s*(.*?)\s*</JSON>', text, re.DOTALL)
    if tagged is None:
        return "Aucun JSON trouvé"

    payload = tagged.group(1)
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return "Erreur de décodage JSON"
# Function to get metadata title from image
def get_image_metadata(image):
    """Derive a document title from the name of an uploaded file.

    The title is the file's base name up to (not including) its first dot,
    e.g. ``"ECG.png"`` -> ``"ECG"``.

    Args:
        image: Object exposing a ``name`` attribute holding the file name or
            a path to the file.

    Returns:
        The title string.
    """
    # ``name`` may be a full path rather than a bare file name; drop the
    # directory part first so separators or dots in folder names never end up
    # in (or truncate) the title.
    file_name = os.path.basename(image.name)
    return file_name.split('.')[0]
def count_tokens(text):
    """Return the number of whitespace-separated words in ``text``.

    This is a plain word count, not a model tokenizer count.
    """
    words = text.split()
    return len(words)
# Parameters to extract for each document type. Keys are the titles derived
# from the uploaded file names; values are comma-separated parameter names
# inserted verbatim into the LLM prompt.
image_params = {
    "bilan-atherosclerose": "medecin_responsable, rythme_sinusal, valeur_EIM, score_calcique",
    "bilan-medical": "medecin_responsable, date_naissance, prenom, nom, identifiant_patient, nom_medecin",
    # Fixed: stray ")" after ECG_repos_valeur_par_minute.
    "ECG": "medecin_responsable, poids, taille, ECG_repos_valeur_par_minute, valeur_FMT, valeur_niveau_atteint, valeur_diminution_frequence_cardiaque_bpm",
    "echo-doppler": "medecin_responsable, sous_clavieres, vertebrales, carotides",
    # Fixed: "score calcique" -> "score_calcique", matching bilan-atherosclerose.
    "echographie-poumons": "medecin_responsable, score_calcique, technique, resultats",
    "echotomographie-abdominale": "medecin_responsable, foie, vesicule, pancreas, reins, rate, aorte_abdominale, conclusion",
    "echotomographie-cardiaque": "medecin_responsable, taille, poids, surface_corporelle, conclusion",
    "echotomographie-prostate": "medecin_responsable, vessie, ureteres, prostate, conclusion",
    "hematologie": "medecin_responsable, leucocytes, hematies, hemoglobines, hematocrite",
}
# Prompt template (written in French) asking the model to extract a list of
# parameters from a text and answer with a JSON object wrapped in
# <JSON>...</JSON> tags.
# Placeholders: {parameters} (names to extract) and {texte} (source text).
# The doubled braces around the example object are presumably escapes that
# render as literal braces under str.format-style templating — confirm against
# the template engine in use.
user_input = """
Vous allez extraire des paramètres d'un texte à l'intérieur d'un objet JSON, écrit entre <JSON> et </JSON>.
Liste des paramètres : {parameters}
Voici un exemple de réponse valide :
<JSON>
{{"date_naissance": "", "prenom": "", "nom": ""}}
</JSON>
Voici le texte à partir duquel vous devez extraire les paramètres :
{texte}
"""
# Module-level LLM client bound to the inference endpoint at API_URL.
# The prompt template and chain are not created here; they are built inside
# extract_json_from_images.
llm = HuggingFaceEndpoint(endpoint_url=API_URL)
# # File uploader for multiple images
# uploaded_images = st.file_uploader("Upload images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)
# # Modify the Streamlit section to extract the JSON for multiple images
# if st.button("Submit"):
# if uploaded_images:
# all_json_data = {} # Dictionary to store JSON data for each image
# for uploaded_image in uploaded_images:
# with st.spinner(f"Extracting text from image: {uploaded_image.name}..."):
# image = Image.open(uploaded_image)
# extracted_text = extract_text_from_image(image)
# max_text_length = 500 # Adjust as needed to keep total tokens under 1024
# if count_tokens(extracted_text) > max_text_length:
# extracted_text = " ".join(extracted_text.split()[:max_text_length])
# with st.spinner(f"Fetching response from API for {uploaded_image.name}..."):
# # Get metadata title from the image
# title = get_image_metadata(uploaded_image)
# parameters = image_params[title]
# output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})
# st.success(f"Response received for {uploaded_image.name}!")
# # Extract JSON from the API output
# json_data = extract_json(output) # Extract JSON from the API output
# all_json_data[title] = json_data # Store JSON data with title as key
# st.write(title, json_data)
# # Display all extracted JSON data
# st.write("Extracted JSON Data for all images.")
# else:
# st.warning("Please upload at least one image to extract text.")
def extract_json_from_images(uploaded_images, *, max_text_length=500):
    """Run OCR and LLM parameter extraction on each uploaded image.

    For every image, the OCR text is extracted and truncated to
    ``max_text_length`` whitespace-separated words. The title derived from the
    file name selects the parameter list in ``image_params``. The module-level
    ``user_input`` template is filled in and sent to ``llm``, and the JSON
    section of the response is parsed with ``extract_json``.

    Args:
        uploaded_images: Iterable of uploaded files. Each item is passed to
            ``Image.open`` and to ``get_image_metadata`` (which reads its
            ``name`` attribute).
        max_text_length: Maximum number of words of OCR text sent to the
            model. The default of 500 is meant to keep the total prompt under
            1024 tokens; adjust as needed.

    Returns:
        A dict mapping each image title to the value returned by
        ``extract_json`` (the decoded JSON, or a status string when no valid
        JSON was found). Images that share a title overwrite earlier entries.

    Raises:
        KeyError: If an image title has no entry in ``image_params``.
    """
    # The template and chain do not depend on the image, so build them once
    # instead of on every iteration.
    prompt = PromptTemplate.from_template(user_input)
    llm_chain = prompt | llm

    all_json_data = {}  # title -> extracted JSON data
    for uploaded_image in uploaded_images:
        # Use the image as a context manager so the file Pillow opens is
        # released as soon as OCR is done.
        with Image.open(uploaded_image) as image:
            extracted_text = extract_text_from_image(image)

        # Split once and reuse the words for both the length check and the
        # truncation.
        words = extracted_text.split()
        if len(words) > max_text_length:
            extracted_text = " ".join(words[:max_text_length])

        # The title (file name without extension) selects the parameter list.
        title = get_image_metadata(uploaded_image)
        parameters = image_params[title]

        output = llm_chain.invoke({"texte": extracted_text, "parameters": parameters})

        # Keep only the JSON section of the model response.
        all_json_data[title] = extract_json(output)
    return all_json_data