sachinchandrankallar committed on
Commit 4abf821
1 Parent(s): 89a714b

Initial commit to Hugging Face Space
Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # Use a lightweight Python base image
+ FROM python:3.10-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     pkg-config \
+     libsystemd-dev \
+     libcairo2-dev \
+     tesseract-ocr \
+     libglib2.0-0 \
+     libsm6 \
+     libxrender1 \
+     libxext6 \
+     poppler-utils \
+     gettext \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy only dependency files first for better caching
+ COPY requirements.txt .
+
+ # Install pip and dependencies
+ RUN pip install --upgrade pip \
+     && pip install torch==2.6.0 --no-cache-dir \
+     && pip install -r requirements.txt --no-cache-dir
+
+ # Copy the rest of the code (after dependencies, so it doesn't bust the cache)
+ COPY . .
+
+ # Expose port 7860 (required by HF Spaces)
+ EXPOSE 7860
+
+ # Run the Flask app
+ CMD ["python", "-m", "ai_med_extract", "--port=7860", "--host=0.0.0.0"]
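A minimal local smoke test of this image (the ai-med-extract tag is illustrative):

    docker build -t ai-med-extract .
    docker run -p 7860:7860 ai-med-extract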
ai_med_extract/__init__.py ADDED
@@ -0,0 +1 @@
+ # ai_med_extract/__init__.py
ai_med_extract/__main__.py ADDED
@@ -0,0 +1,10 @@
+ import argparse
+ from .app import app
+
+ # Module entrypoint; parse --host/--port so the Dockerfile CMD takes effect
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", default="0.0.0.0")
+     parser.add_argument("--port", type=int, default=5000)
+     args = parser.parse_args()
+     app.run(host=args.host, port=args.port, debug=True)
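With the argument parsing above, the container command from the Dockerfile maps onto a plain module run; the local equivalent is:

    python -m ai_med_extract --port=7860 --host=0.0.0.0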
ai_med_extract/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (134 Bytes).
 
ai_med_extract/__pycache__/__main__.cpython-313.pyc ADDED
Binary file (305 Bytes).
 
ai_med_extract/__pycache__/app.cpython-313.pyc ADDED
Binary file (2.19 kB).
 
ai_med_extract/agents/__init__.py ADDED
@@ -0,0 +1 @@
+ # ai_med_extract/agents/__init__.py
ai_med_extract/agents/medical_data_extractor.py ADDED
@@ -0,0 +1,29 @@
+ import logging
+ import json
+
+ class MedicalDataExtractorAgent:
+     def __init__(self, gen_model_loader):
+         self.gen_model_loader = gen_model_loader
+
+     def extract_medical_data(self, text):
+         try:
+             generator = self.gen_model_loader.load()
+             prompt = (
+                 "Extract structured medical information from the following clinical note.\n\n"
+                 "Return the result in JSON format with the following fields:\n"
+                 "patient_condition, symptoms, current_problems, allergies, dr_notes, "
+                 "prescription, investigations, follow_up_instructions.\n\n"
+                 f"Clinical Note:\n{text}\n\n"
+                 "Structured JSON Output:\n"
+             )
+             response = generator(prompt, max_new_tokens=256)[0]["generated_text"]
+             logging.debug(f"Raw model output: {response}")
+             json_start = response.find("{")
+             json_end = response.rfind("}") + 1
+             if json_start == -1 or json_end == 0:  # rfind failure yields -1, so +1 gives 0
+                 raise ValueError("No JSON found in the model response.")
+             json_str = response[json_start:json_end]
+             return json.loads(json_str)
+         except Exception as e:
+             logging.error(f"Error extracting medical data: {e}")
+             return {"error": f"Failed to extract medical data: {str(e)}"}
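The agent only requires a loader object exposing .load() that returns a text-generation callable, so a stub is enough to exercise it locally (a sketch; StubLoader and the distilgpt2 model choice are illustrative, and small models rarely emit clean JSON, which is what the error fallback is for):

    from transformers import pipeline
    from ai_med_extract.agents.medical_data_extractor import MedicalDataExtractorAgent

    class StubLoader:
        # Hypothetical stand-in for a lazy model loader
        def load(self):
            return pipeline("text-generation", model="distilgpt2")

    agent = MedicalDataExtractorAgent(StubLoader())
    print(agent.extract_medical_data("Patient reports fever and a dry cough for three days."))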
ai_med_extract/agents/phi_scrubber.py ADDED
@@ -0,0 +1,16 @@
+ import re
+ import logging
+
+ class PHIScrubberAgent:
+     @staticmethod
+     def scrub_phi(text):
+         try:
+             text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
+             text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
+             text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
+             text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
+             text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
+             text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
+         except Exception as e:
+             logging.error(f"PHI scrubbing failed: {e}")
+         return text
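A quick sanity check of the scrubber (sample values are invented; the substitutions run in the order listed, so phone and email are masked before the generic two-capitalized-words name rule fires):

    from ai_med_extract.agents.phi_scrubber import PHIScrubberAgent

    sample = "reach John Smith at 555-123-4567 or john.smith@example.com"
    print(PHIScrubberAgent.scrub_phi(sample))
    # -> "reach [NAME] at [PHONE] or [EMAIL]"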
ai_med_extract/agents/summarizer.py ADDED
@@ -0,0 +1,14 @@
+ import logging
+
+ class SummarizerAgent:
+     def __init__(self, summarization_model_loader):
+         self.summarization_model_loader = summarization_model_loader
+
+     def generate_summary(self, text):
+         model = self.summarization_model_loader.load()
+         try:
+             summary_result = model(text, max_length=150, min_length=30, do_sample=False)
+             return summary_result[0]['summary_text'].strip()
+         except Exception as e:
+             logging.error(f"Summary generation failed: {e}")
+             return "Summary generation failed."
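As with the extractor agent, any object with a .load() returning a summarization pipeline works; a sketch with a stub loader (google-t5/t5-small is the checkpoint combined1.py uses, shown here only as an example):

    from transformers import pipeline
    from ai_med_extract.agents.summarizer import SummarizerAgent

    class StubLoader:
        # Hypothetical stand-in for a lazy summarization loader
        def load(self):
            return pipeline("summarization", model="google-t5/t5-small")

    agent = SummarizerAgent(StubLoader())
    print(agent.generate_summary("Patient presented with three days of fever and a productive cough; chest X-ray was ordered and amoxicillin started."))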
ai_med_extract/agents/text_extractor.py ADDED
@@ -0,0 +1,66 @@
+ import pdfplumber
+ import pytesseract
+ import cv2
+ import pandas as pd
+ from PIL import Image
+ from docx import Document
+ import tempfile
+ import os
+ import logging
+
+ class TextExtractorAgent:
+     @staticmethod
+     def extract_text(filepath, ext):
+         try:
+             if ext == "pdf":
+                 return TextExtractorAgent.extract_text_from_pdf(filepath)
+             elif ext in {"jpg", "jpeg", "png"}:
+                 return TextExtractorAgent.extract_text_from_image(filepath)
+             elif ext == "docx":
+                 return TextExtractorAgent.extract_text_from_docx(filepath)
+             elif ext in {"xlsx", "xls"}:
+                 return TextExtractorAgent.extract_text_from_excel(filepath)
+             return None
+         except Exception as e:
+             logging.error(f"Text extraction failed: {e}")
+             return None
+
+     @staticmethod
+     def extract_text_from_pdf(filepath, password=None):
+         text = ""
+         with pdfplumber.open(filepath) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_image(filepath):
+         image = cv2.imread(filepath)
+         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+         _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+             processed_path = temp_file.name
+         cv2.imwrite(processed_path, processed)
+         text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
+         os.remove(processed_path)
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_docx(filepath):
+         doc = Document(filepath)
+         text = "\n".join([para.text for para in doc.paragraphs])
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_excel(filepath):
+         dfs = pd.read_excel(filepath, sheet_name=None)
+         text = "\n".join([
+             "\n".join([
+                 " ".join(map(str, df[col].dropna()))
+                 for col in df.columns
+             ])
+             for df in dfs.values()
+         ])
+         return text.strip() or None
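Dispatch is keyed on the extension string the caller supplies, so typical use derives it from the filename (the path is illustrative):

    from ai_med_extract.agents.text_extractor import TextExtractorAgent

    path = "uploads/report.pdf"  # illustrative path
    ext = path.rsplit(".", 1)[-1].lower()
    print(TextExtractorAgent.extract_text(path, ext) or "No text extracted")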
ai_med_extract/api/__init__.py ADDED
@@ -0,0 +1 @@
+ # ai_med_extract/api/__init__.py
ai_med_extract/api/routes.py ADDED
@@ -0,0 +1,133 @@
+ from flask import request, jsonify, abort, current_app
+ from transformers import pipeline
+ from ..app import app
+ from ..agents.text_extractor import TextExtractorAgent
+ from ..agents.phi_scrubber import PHIScrubberAgent
+ from ..agents.summarizer import SummarizerAgent
+ from ..agents.medical_data_extractor import MedicalDataExtractorAgent
+ from ..utils.file_utils import ALLOWED_EXTENSIONS, allowed_file, check_file_size, save_data_to_storage, get_data_from_storage
+ from ..utils.validation import clean_result, validate_patient_name
+ import os
+ import logging
+
+ @app.route("/upload", methods=["POST"])
+ def upload_file():
+     files = request.files.getlist("file")
+     patient_name = request.form.get("patient_name", "").strip()
+     password = request.form.get("password")
+     qa_model_name = request.form.get("qa_model_name")
+     qa_model_type = request.form.get("qa_model_type")
+     ner_model_name = request.form.get("ner_model_name")
+     ner_model_type = request.form.get("ner_model_type")
+     summarizer_model_name = request.form.get("summarizer_model_name")
+     summarizer_model_type = request.form.get("summarizer_model_type")
+     if not files:
+         return jsonify({"error": "No file uploaded"}), 400
+     # Model loading (example, adjust as needed)
+     try:
+         qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
+     except Exception as e:
+         return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
+     try:
+         ner_pipeline = pipeline(task=ner_model_type, model=ner_model_name)
+     except Exception as e:
+         return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
+     try:
+         summarizer_pipeline = pipeline(task=summarizer_model_type, model=summarizer_model_name)
+     except Exception as e:
+         return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
+     extracted_data = []
+     for file in files:
+         if file.filename == '':
+             continue
+         if not allowed_file(file.filename):
+             return jsonify({"error": f"Unsupported file type: {file.filename}. Supported file types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
+         if not patient_name:
+             return jsonify({"error": "Patient name is missing"}), 400
+         valid_size, error_message = check_file_size(file)
+         if not valid_size:
+             return jsonify({"error": error_message}), 400
+         filename = file.filename
+         filepath = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
+         file.save(filepath)
+         ext = filename.rsplit('.', 1)[-1].lower()
+         extracted_text = TextExtractorAgent.extract_text(filepath, ext)
+         if not extracted_text or extracted_text == "No text found":
+             return jsonify({"error": f"Failed to extract text from {filename}"}), 415
+         skip_medical_check = request.form.get("skip_medical_check", "false").lower() == "true"
+         if not skip_medical_check:
+             ner_results = ner_pipeline(extracted_text)
+             medical_entities = list(set([r["word"] for r in ner_results if r["entity"].startswith("B-") or r["entity"].startswith("I-")]))
+             if not medical_entities:
+                 return jsonify({"error": f"'{filename}' is not medically relevant"}), 406
+         skip_patient_check = request.form.get("skip_patient_check", "false").lower() == "true"
+         if not skip_patient_check:
+             try:
+                 error_response = validate_patient_name(extracted_text, patient_name, filename, qa_pipeline)
+                 if error_response:
+                     return error_response
+             except Exception as e:
+                 return jsonify({"error": f"Patient name validation failed: {str(e)}"}), 500
+         try:
+             summary = summarizer_pipeline(extracted_text, max_length=350, min_length=50, do_sample=False)[0]["summary_text"]
+         except Exception as e:
+             summary = "Summary failed"
+         extracted_data.append({
+             "file": filename,
+             "extracted_text": extracted_text,
+             "summary": summary,
+             "message": "Successful"
+         })
+     if not extracted_data:
+         return jsonify({"error": "No valid medical files processed"}), 400
+     return jsonify({"extracted_data": extracted_data}), 200
+
+ @app.route("/get_updated_medical_data", methods=["GET"])
+ def get_updated_data():
+     file_name = request.args.get('file')
+     if not file_name:
+         return jsonify({"error": "File name is required"}), 400
+     file_name = file_name.rsplit(".", 1)[0]
+     updated_data = get_data_from_storage(file_name)
+     if updated_data:
+         return jsonify({"file": file_name, "data": updated_data}), 200
+     else:
+         return jsonify({"error": f"File '{file_name}' not found"}), 404
+
+ @app.route("/update_medical_data", methods=["PUT"])
+ def update_medical_data():
+     try:
+         data = request.json
+         filename = data.get("file")
+         filename = filename.rsplit(".", 1)[0]
+         updates = data.get("updates", [])
+         if not filename or not updates:
+             return jsonify({"error": "File name or updates missing"}), 400
+         existing_data = get_data_from_storage(filename)
+         if not existing_data:
+             return jsonify({"error": f"File '{filename}' not found"}), 404
+         for update in updates:
+             category = update.get("category")
+             field = update.get("field")
+             new_value = update.get("value")
+             updated = False
+             for cat in existing_data.get("extracted_data", []):
+                 for categorized_data in cat.get("categorized_data", []):
+                     if categorized_data.get("name") == category:
+                         for fld in categorized_data.get("fields", []):
+                             if fld.get("label") == field:
+                                 fld["value"] = new_value
+                                 updated = True
+                                 break
+                     if updated:
+                         break
+                 if updated:
+                     break
+         save_data_to_storage(filename, existing_data)
+         return jsonify({"message": "Data updated successfully", "updated_data": existing_data}), 200
+     except Exception as e:
+         return jsonify({"error": str(e)}), 500
+
+ @app.route("/")
+ def home():
+     return "Medical Data Extraction API is running!"
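A sketch of calling /upload once the Space is running; the three model names are illustrative placeholders, since the route loads whatever the form fields specify:

    import requests

    resp = requests.post(
        "http://localhost:7860/upload",
        files={"file": open("report.pdf", "rb")},  # illustrative file
        data={
            "patient_name": "Jane Doe",
            "qa_model_name": "deepset/roberta-base-squad2",
            "qa_model_type": "question-answering",
            "ner_model_name": "dslim/bert-base-NER",
            "ner_model_type": "token-classification",
            "summarizer_model_name": "google-t5/t5-small",
            "summarizer_model_type": "summarization",
        },
    )
    print(resp.status_code, resp.json())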
ai_med_extract/app.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ import logging
+ from flask import Flask, request, jsonify, abort
+ from flask_cors import CORS
+ from werkzeug.utils import secure_filename
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ import whisper
+ from dotenv import load_dotenv
+ from .agents.text_extractor import TextExtractorAgent
+ from .agents.phi_scrubber import PHIScrubberAgent
+ from .agents.summarizer import SummarizerAgent
+ from .agents.medical_data_extractor import MedicalDataExtractorAgent
+ from .utils.file_utils import allowed_file, check_file_size, save_data_to_storage, get_data_from_storage
+ from .utils.validation import clean_result, validate_patient_name
+
+ # Load environment variables
+ load_dotenv()
+
+ app = Flask(__name__)
+ CORS(app)
+
+ UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
+ app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB max file size
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ # Model loaders (example, adjust as needed)
+ medalpaca_model_loader = None  # TODO: Implement LazyModelLoader if needed
+ summarization_model_loader = None  # TODO: Implement LazyModelLoader if needed
+ whisper_model = whisper.load_model("tiny")
+
+ # Initialize agents
+ text_extractor_agent = TextExtractorAgent()
+ phi_scrubber_agent = PHIScrubberAgent()
+ medical_data_extractor_agent = MedicalDataExtractorAgent(medalpaca_model_loader)
+ summarizer_agent = SummarizerAgent(summarization_model_loader)
+
+ from .api import routes  # Import routes to register endpoints
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5000, debug=True)
ai_med_extract/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ # ai_med_extract/utils/__init__.py
ai_med_extract/utils/file_utils.py ADDED
@@ -0,0 +1,53 @@
+ import os
+ import re
+ import json
+ import logging
+ from werkzeug.utils import secure_filename
+ from flask import current_app
+
+ ALLOWED_EXTENSIONS = {"pdf", "jpg", "jpeg", "png", "svg", "docx", "doc", "xlsx", "xls"}
+ MAX_SIZE_PDF_DOCS = 1 * 1024 * 1024 * 1024  # 1GB
+ MAX_SIZE_IMAGES = 500 * 1024 * 1024  # 500MB
+
+
+ def allowed_file(filename):
+     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
+
+
+ def check_file_size(file):
+     file.seek(0, os.SEEK_END)
+     size = file.tell()
+     file.seek(0)
+     extension = file.filename.rsplit('.', 1)[-1].lower()
+     if extension in {"pdf", "docx"} and size > MAX_SIZE_PDF_DOCS:
+         return False, f"File {file.filename} exceeds 1GB size limit"
+     elif extension in {"jpg", "jpeg", "png"} and size > MAX_SIZE_IMAGES:
+         return False, f"Image {file.filename} exceeds 500MB size limit"
+     return True, None
+
+
+ def save_data_to_storage(filename, data):
+     try:
+         upload_folder = current_app.config.get("UPLOAD_FOLDER", "uploads")
+         if not os.path.exists(upload_folder):
+             os.makedirs(upload_folder, exist_ok=True)
+         filename = filename.rsplit(".", 1)[0]
+         filepath = os.path.join(upload_folder, f"{filename}.json")
+         with open(filepath, "w") as file:
+             json.dump(data, file)
+     except Exception as e:
+         logging.error(f"Exception during save: {e}")
+
+
+ def get_data_from_storage(filename):
+     try:
+         upload_folder = current_app.config.get("UPLOAD_FOLDER", "uploads")
+         filepath = os.path.join(upload_folder, f"{filename}.json")
+         if not os.path.exists(filepath):
+             return None
+         with open(filepath, "r") as file:
+             data = json.load(file)
+         return data
+     except Exception as e:
+         logging.error(f"Error loading data: {e}")
+         return None
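The two storage helpers round-trip JSON keyed by the base filename under the configured upload folder; inside an application context (the payload is illustrative):

    from ai_med_extract.app import app
    from ai_med_extract.utils.file_utils import save_data_to_storage, get_data_from_storage

    with app.app_context():
        save_data_to_storage("report.pdf", {"extracted_data": []})  # written to uploads/report.json
        print(get_data_from_storage("report"))  # -> {'extracted_data': []}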
ai_med_extract/utils/validation.py ADDED
@@ -0,0 +1,40 @@
+ import re
+ from flask import jsonify
+
+ def clean_result(value):
+     value = re.sub(r"\s+", " ", value)
+     value = re.sub(r"[-_:]+", " ", value)
+     value = re.sub(r"[^\x00-\x7F]+", " ", value)
+     return value if value else "Not Available"
+
+ def normalize_name(name):
+     if not name:
+         return ""
+     name = name.lower().strip()
+     name = re.sub(r"[^\w\s]", "", name)
+     name = re.sub(r"^\b\w{1,5}\b\s+", "", name)
+     return name
+
+ def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
+     detected_name = extract_patient_name(extracted_text, qa_pipeline)
+     if not detected_name:
+         return jsonify({"error": f"Could not determine patient name from {filename}"}), 400
+     normalized_detected_name = normalize_name(detected_name)
+     normalized_patient_name = normalize_name(patient_name)
+     if normalized_detected_name not in normalized_patient_name:
+         return jsonify({
+             "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
+         }), 400
+     return None
+
+ def extract_patient_name(text, qa_pipeline):
+     if not text or not qa_pipeline:
+         return None
+     try:
+         result = qa_pipeline(
+             question="What is the patient's name?",
+             context=text
+         )
+         return result.get("answer", "").strip()
+     except Exception as e:
+         return None
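Note that normalize_name treats any leading word of up to five characters as a salutation, which also strips short first names; a quick check (values illustrative):

    from ai_med_extract.utils.validation import normalize_name

    print(normalize_name("Dr. Jane Doe"))   # -> "jane doe"
    print(normalize_name("Mrs. Jane Doe"))  # -> "jane doe"
    print(normalize_name("Jane Doe"))       # -> "doe" (short first name removed too)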
combined1.py ADDED
@@ -0,0 +1,880 @@
+ import json
+ import os
+ import re
+ import logging
+ from dotenv import load_dotenv
+ from flask import Flask, request, jsonify, abort
+ from werkzeug.utils import secure_filename
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import pytesseract
+ import cv2
+ import pdfplumber
+ import pandas as pd
+ from PIL import Image
+ from docx import Document
+ from flask_cors import CORS
+ from flask_executor import Executor
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import whisper
+ from PyPDF2 import PdfReader
+ from pdf2image import convert_from_path
+ from concurrent.futures import ThreadPoolExecutor
+ import tempfile
+ import tensorflow.keras.layers as KL  # Instead of keras.layers as KL
+ import numpy as np
+
+ # Load environment variables
+ load_dotenv()
+
+ # Set Tesseract OCR path (Windows-specific; remove or adjust when running on Linux)
+ pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+ # Initialize Flask app
+ app = Flask(__name__)
+ CORS(app)
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ # Configure upload directory and max file size
+ UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
+ app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB max file size
+
+ # Initialize Flask-Executor for asynchronous tasks
+ executor = Executor(app)
+ whisper_model = whisper.load_model("tiny")
+
+ # Allowed file extensions
+ ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
+ ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
+
+ UPLOAD_FOLDER = 'Uploads'
+ ALLOWED_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'svg', 'docx', 'doc'}
+ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+ # Set file size limits
+ MAX_SIZE_PDF_DOCS = 1 * 1024 * 1024 * 1024  # 1GB
+ MAX_SIZE_IMAGES = 500 * 1024 * 1024  # 500MB
+
+ # Lazy model loading to save resources
+ class LazyModelLoader:
+     def __init__(self, model_name, task, tokenizer=None):
+         self.model_name = model_name
+         self.task = task
+         self.tokenizer = tokenizer
+         self._model = None
+
+     def load(self):
+         """Load the model if not already loaded."""
+         if self._model is None:
+             logging.info(f"Loading model: {self.model_name}")
+             if self.task == "text-generation":
+                 self._model = AutoModelForCausalLM.from_pretrained(
+                     self.model_name, device_map="auto", torch_dtype="auto"
+                 )
+                 self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, legacy=False)
+                 if self._model.generation_config.pad_token_id is None or self._model.generation_config.pad_token_id < 0:
+                     if self._tokenizer.eos_token_id is not None:
+                         self._model.generation_config.pad_token_id = self._tokenizer.eos_token_id
+                         logging.info(f"Set pad_token_id to {self._tokenizer.eos_token_id}")
+                     else:
+                         logging.warning("No valid eos_token_id found. Setting pad_token_id to 0 as a fallback.")
+                         self._model.generation_config.pad_token_id = 0
+             else:
+                 self._model = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
+         return self._model
+
+ # Text extraction agents
+ class TextExtractorAgent:
+     @staticmethod
+     def extract_text(filepath, ext):
+         """Extract text based on file type."""
+         try:
+             if ext == "pdf":
+                 return TextExtractorAgent.extract_text_from_pdf(filepath)
+             elif ext in {"jpg", "jpeg", "png"}:
+                 return TextExtractorAgent.extract_text_from_image(filepath)
+             elif ext == "docx":
+                 return TextExtractorAgent.extract_text_from_docx(filepath)
+             elif ext in {"xlsx", "xls"}:
+                 return TextExtractorAgent.extract_text_from_excel(filepath)
+             return None
+         except Exception as e:
+             logging.error(f"Text extraction failed: {e}")
+             return None
+
+     @staticmethod
+     def extract_text_from_pdf(filepath):
+         """Extract text from a PDF file."""
+         text = ""
+         with pdfplumber.open(filepath) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_image(filepath):
+         """Extract text from an image using OCR."""
+         image = cv2.imread(filepath)
+         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+         _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+             processed_path = temp_file.name
+         cv2.imwrite(processed_path, processed)
+         text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
+         os.remove(processed_path)
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_docx(filepath):
+         """Extract text from a DOCX file."""
+         doc = Document(filepath)
+         text = "\n".join([para.text for para in doc.paragraphs])
+         return text.strip() or None
+
+     @staticmethod
+     def extract_text_from_excel(filepath):
+         """Extract text from an Excel file."""
+         dfs = pd.read_excel(filepath, sheet_name=None)
+         text = "\n".join([
+             "\n".join([
+                 " ".join(map(str, df[col].dropna()))
+                 for col in df.columns
+             ])
+             for df in dfs.values()
+         ])
+         return text.strip() or None
+
+ # PHI scrubbing agent
+ class PHIScrubberAgent:
+     @staticmethod
+     def scrub_phi(text):
+         """Remove sensitive personal health information (PHI)."""
+         try:
+             text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
+             text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
+             text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
+             text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
+             text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
+             text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
+         except Exception as e:
+             logging.error(f"PHI scrubbing failed: {e}")
+         return text
+
+ # Summarization agent
+ class SummarizerAgent:
+     def __init__(self, summarization_model_loader):
+         self.summarization_model_loader = summarization_model_loader
+
+     def generate_summary(self, text):
+         """Generate a summary of the provided text."""
+         model = self.summarization_model_loader.load()
+         try:
+             summary_result = model(text, do_sample=False)
+             return summary_result[0]['summary_text'].strip()
+         except Exception as e:
+             logging.error(f"Summary generation failed: {e}")
+             return "Summary generation failed."
+
+ def allowed_file(filename, allowed_extensions=ALLOWED_EXTENSIONS):
+     """Check if the file extension is allowed."""
+     return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
+
+ # Knowledge Base
+ class KnowledgeBase:
+     def __init__(self, documents):
+         self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+         self.documents = documents
+         self.embeddings = self.embedding_model.encode(documents)
+         self.dimension = self.embedding_model.get_sentence_embedding_dimension()
+         self.index = faiss.IndexFlatL2(self.dimension)
+         self.index.add(self.embeddings)
+
+     def retrieve_relevant_info(self, query, top_k=3):
+         """Retrieve relevant medical information from the knowledge base."""
+         query_embedding = self.embedding_model.encode([query])
+         distances, indices = self.index.search(query_embedding, top_k)
+         relevant_texts = [self.documents[i] for i in indices[0]]
+         return relevant_texts
+
+ # Medical data extraction agent
+ class MedicalDataExtractorAgent:
+     def __init__(self, model_loader, knowledge_base):
+         self.model_loader = model_loader
+         self.knowledge_base = knowledge_base
+
+     def retrieve_relevant_info(self, query, top_k=3):
+         """Retrieve relevant medical information from the knowledge base."""
+         query_embedding = self.knowledge_base.embedding_model.encode([query])
+         distances, indices = self.knowledge_base.index.search(query_embedding, top_k)
+         relevant_texts = [self.knowledge_base.documents[i] for i in indices[0]]
+         return relevant_texts
+
+     def extract_medical_data(self, text):
+         """Extract structured medical data from text using Agentic RAG."""
+         try:
+             default_schema = {
+                 "patient_name": "[NAME]",
+                 "age": None,
+                 "gender": None,
+                 "diagnosis": [],
+                 "symptoms": [],
+                 "medications": [],
+                 "allergies": [],
+                 "vitals": {
+                     "blood_pressure": None,
+                     "heart_rate": None,
+                     "temperature": None
+                 },
+                 "notes": ""
+             }
+             prompt = f"""
+             ### Instruction:
+             Extract structured medical data from the following text as a JSON whose parameters are enclosed in "" and without any \.
+             The JSON should include patientname, age, gender, medications, allergies, diagnosis, symptoms, vitals, and notes.
+             ### Text:
+             {text}
+             ### Response:
+             """
+             model = self.model_loader.load()
+             tokenizer = self.model_loader._tokenizer
+             inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+             outputs = model.generate(
+                 inputs.input_ids,
+                 num_return_sequences=1,
+                 temperature=0.7,
+                 top_p=0.9,
+                 do_sample=True
+             )
+             response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+             logging.info(f"Model response: {response}")
+             json_start = response.find("{")
+             json_end = response.rfind("}") + 1
+             if json_start == -1 or json_end == 0:  # rfind failure yields -1, so +1 gives 0
+                 raise ValueError("No JSON found in the model response.")
+             structured_data = json.loads(response[json_start:json_end])
+             normalized_data = self.normalize_json_output(structured_data, default_schema)
+             if normalized_data["vitals"]["blood_pressure"] and isinstance(normalized_data["vitals"]["blood_pressure"], str):
+                 normalized_data["vitals"]["blood_pressure"] = normalized_data["vitals"]["blood_pressure"].strip('"')
+             return json.dumps(normalized_data)
+         except json.JSONDecodeError as e:
+             logging.error(f"JSON parsing failed: {e}")
+             return json.dumps({"error": f"Failed to parse JSON: {str(e)}"})
+         except Exception as e:
+             logging.error(f"Error extracting medical data: {e}")
+             return json.dumps({"error": f"Failed to extract medical data: {str(e)}"})
+
+     @staticmethod
+     def normalize_json_output(model_output, default_schema):
+         """Normalize the model's JSON output to match the default schema."""
+         try:
+             normalized_output = default_schema.copy()
+             for key in normalized_output:
+                 if key in model_output:
+                     normalized_output[key] = model_output[key]
+             return normalized_output
+         except Exception as e:
+             logging.error(f"Failed to normalize JSON: {e}")
+             return default_schema
+
+ # Initialize lazy loaders
+ medalpaca_model_loader = LazyModelLoader(
+     model_name="stanford-crfm/BioMedLM",
+     task="text-generation"
+ )
+ summarization_model_loader = LazyModelLoader("google-t5/t5-small", "summarization")
+
+ # Initialize knowledge base
+ medical_documents = [
+     "Hypertension is a chronic condition characterized by elevated blood pressure.",
+     "Diabetes is a metabolic disorder that affects blood sugar levels.",
+     "Common symptoms of chest pain include pressure, tightness, or discomfort in the chest."
+ ]
+ knowledge_base = KnowledgeBase(medical_documents)
+
+ # Initialize agents
+ text_extractor_agent = TextExtractorAgent()
+ phi_scrubber_agent = PHIScrubberAgent()
+ medical_data_extractor_agent = MedicalDataExtractorAgent(medalpaca_model_loader, knowledge_base)
+ summarizer_agent = SummarizerAgent(summarization_model_loader)
+
+ # NER to detect medical info
+ CONFIDENCE_THRESHOLD = 0.80
+
+ def extract_medical_entities(text, ner_pipeline):
+     if not text or not text.strip():
+         return ["No medical entities found"]
+     if ner_pipeline is None:
+         print("⚠️ NER model is not loaded, skipping entity extraction.")
+         return ["No medical entities found"]
+
+     ner_results = ner_pipeline(text)
+     relevant_entities = {
+         "Disease", "MedicalCondition", "Symptom", "Sign_or_Symptom",
+         "B-DISEASE", "I-DISEASE",
+         "Test", "Measurement", "B-TEST", "I-TEST", "Lab_value", "B-Lab_value", "I-Lab_value",
+         "Medication", "B-MEDICATION", "I-MEDICATION", "Treatment",
+         "Procedure", "B-Diagnostic_procedure", "I-Diagnostic_procedure",
+         "Anatomical_site", "Body_Part", "Organ_or_Tissue",
+         "Diagnostic_procedure", "Surgical_Procedure", "Therapeutic_Procedure",
+         "Health_condition", "B-Health_condition", "I-Health_condition",
+         "Pathological_Condition", "Clinical_Event",
+         "Chemical_Substance", "B-Chemical_Substance", "I-Chemical_Substance",
+         "Biological_Entity", "B-Biological_Entity", "I-Biological_Entity"
+     }
+
+     medical_entities = set()
+     for ent in ner_results:
+         entity_label = ent.get("entity_group") or ent.get("entity")
+         if entity_label in relevant_entities and ent["score"] >= CONFIDENCE_THRESHOLD:
+             word = ent["word"].lower().strip().replace("-", "")
+             if len(word) > 2:
+                 medical_entities.add(word)
+
+     if len(medical_entities) >= 5:
+         return list(medical_entities)
+
+     return ["No medical entities found"]
+
+ # Validation: check file size
+ def check_file_size(file):
+     file.seek(0, os.SEEK_END)
+     size = file.tell()
+     file.seek(0)
+     extension = file.filename.rsplit('.', 1)[-1].lower()
+     if extension in {'pdf', 'docx'} and size > MAX_SIZE_PDF_DOCS:
+         return False, f"File {file.filename} exceeds 1GB size limit"
+     elif extension in {'jpg', 'jpeg', 'png'} and size > MAX_SIZE_IMAGES:
+         return False, f"Image {file.filename} exceeds 500MB size limit"
+     return True, None
+
+ def extract_patient_name(text, qa_pipeline):
+     """Extracts the patient name using the given QA pipeline."""
+     if not text or not qa_pipeline:
+         return None
+     try:
+         result = qa_pipeline(
+             question="What is the patient's name?",
+             context=text
+         )
+         return result.get("answer", "").strip()
+     except Exception as e:
+         print(f"⚠️ Error extracting patient name: {e}")
+         return None
+
+ def normalize_name(name):
+     """Cleans and normalizes names for comparison, removing salutations dynamically."""
+     if not name:
+         return ""
+     name = name.lower().strip()
+     name = re.sub(r"[^\w\s]", "", name)
+     name = re.sub(r"^\b\w{1,5}\b\s+", "", name)
+     return name
+
+ def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
+     """Validates that the extracted name matches the registered patient name."""
+     detected_name = extract_patient_name(extracted_text, qa_pipeline)
+     if not detected_name:
+         return jsonify({"error": f"Could not determine patient name from {filename}"}), 400
+     normalized_detected_name = normalize_name(detected_name)
+     normalized_patient_name = normalize_name(patient_name)
+     if normalized_detected_name not in normalized_patient_name:
+         return jsonify({
+             "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
+         }), 400
+     return None
+
+ def is_blurred(image_path, variance_threshold=150):
+     try:
+         image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+         if image is None:
+             print(f"❌ Error: Unable to read image {image_path}")
+             return True
+         laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
+         print(f"🔍 Blur Check: Variance={laplacian_var} (Threshold={variance_threshold})")
+         edges = cv2.Canny(image, 50, 150)
+         edge_density = np.mean(edges)
+         print(f"📏 Edge Density: {edge_density}")
+         return laplacian_var < variance_threshold and edge_density < 10
+     except Exception as e:
+         print(f"❌ Error detecting blur: {e}")
+         return True
+
+ def extract_text_from_image(filepath):
+     try:
+         if is_blurred(filepath):
+             return "Image is too blurry, OCR failed."
+         image = cv2.imread(filepath)
+         if image is None:
+             return "Image could not be read."
+         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+         gray = cv2.GaussianBlur(gray, (5, 5), 0)
+         gray = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                      cv2.THRESH_BINARY, 11, 2)
+         kernel = np.ones((2, 2), np.uint8)
+         gray = cv2.dilate(gray, kernel, iterations=1)
+         processed_path = f"{filepath}_processed.png"
+         cv2.imwrite(processed_path, gray)
+         text = pytesseract.image_to_string(Image.open(processed_path), lang='eng').strip()
+         words = text.split()
+         if len(words) < 5:
+             return "OCR failed to extract meaningful text."
+         return text
+     except Exception as e:
+         print(f"❌ Error processing {filepath}: {e}")
+         return "Failed to extract text"
+
+ def extract_text_from_pdf(filepath, password=None):
+     """Extract text from PDFs using pdfplumber (faster) or OCR (if needed)."""
+     text = ""
+     try:
+         reader = PdfReader(filepath)
+         if reader.is_encrypted:
+             if not password:
+                 print("🔒 PDF is encrypted but no password was provided.")
+                 return {"error": "File is password-protected. Please provide a password."}, 401
+             decryption_result = reader.decrypt(password)
+             if decryption_result == 0:
+                 print("❌ Incorrect password provided!")
+                 return {"error": "Invalid password provided."}, 403
+             else:
+                 print("✅ PDF successfully decrypted!")
+             text = "\n".join([page.extract_text() or "" for page in reader.pages])
+             if text.strip():
+                 return text.strip(), 200
+         with pdfplumber.open(filepath) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+         if text.strip():
+             return text.strip(), 200
+         images = convert_from_path(filepath)
+         with ThreadPoolExecutor(max_workers=5) as pool:
+             ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang='eng'), images))
+         return ("\n".join(ocr_text).strip(), 200) if ocr_text else ("No text found", 415)
+     except Exception as e:
+         print(f"❌ Error processing PDF {filepath}: {e}")
+         return "Failed to extract text"
+
+ def extract_text_from_docx(filepath):
+     doc = Document(filepath)
+     text = "\n".join([para.text for para in doc.paragraphs])
+     return text.strip() or None
+
+ def clean_result(value):
+     value = re.sub(r"\s+", " ", value)
+     value = re.sub(r"[-_:]+", " ", value)
+     value = re.sub(r"[^\x00-\x7F]+", " ", value)
+     return value if value else "Not Available"
+
+ def mask_sensitive_info(text):
+     text = re.sub(r'(?<=\b\w{2})\w+(?=\s\w{2,})', '***', text)
+     text = re.sub(r'\b(\d{2})\d{2}-(\d{2})\d{2}-(\d{2})\d{2}\b', r'**\2-**\3-**', text)
+     text = re.sub(r'\b(\d{8})(\d{2})\b', r'********\2', text)
+     return text
+
+ # API Endpoints
+ @app.route('/extract_medical_data', methods=['POST'])
+ def extract_medical_data():
+     """Extract structured medical data from raw text."""
+     try:
+         data = request.json
+         if "text" not in data or not data["text"].strip():
+             return jsonify({"error": "No valid text provided"}), 400
+         raw_text = data["text"]
+         clean_text = phi_scrubber_agent.scrub_phi(raw_text)
+         structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
+         return jsonify(json.loads(structured_data)), 200
+     except Exception as e:
+         logging.error(f"Failed to extract medical data: {e}")
+         return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
+
+ @app.route('/api/transcribe', methods=['POST'])
+ def transcribe_audio():
+     """Transcribe audio files into text."""
+     if 'audio' not in request.files:
+         abort(400, description="No audio file provided")
+     audio_file = request.files['audio']
+     if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
+         abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
+     filename = secure_filename(audio_file.filename)
+     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+     audio_file.save(audio_path)
+     try:
+         result = whisper_model.transcribe(audio_path)
+         transcribed_text = result["text"]
+         os.remove(audio_path)
+         return jsonify({"transcribed_text": transcribed_text}), 200
+     except Exception as e:
+         logging.error(f"Transcription failed: {str(e)}")
+         return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
+
+ @app.route('/api/generate_summary', methods=['POST'])
+ def generate_summary():
+     """Generate a summary from the provided text."""
+     data = request.json
+     if "text" not in data or not data["text"].strip():
+         return jsonify({"error": "No valid text provided"}), 400
+     context = data["text"]
+     clean_text = phi_scrubber_agent.scrub_phi(context)
+     summary = summarizer_agent.generate_summary(clean_text)
+     return jsonify({"summary": summary}), 200
+
+ @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
+ def extract_medical_data_from_audio():
+     """Extract medical data from transcribed audio."""
+     if 'audio' not in request.files:
+         abort(400, description="No audio file provided")
+     audio_file = request.files['audio']
+     if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
+         abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
+     logging.info(audio_file.filename)
+     logging.info(app.config['UPLOAD_FOLDER'])
+     filename = secure_filename(audio_file.filename)
+     logging.info(filename)
+     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+     logging.info(audio_path)
+     audio_file.save(audio_path)
+     try:
+         result = whisper_model.transcribe(audio_path)
+         transcribed_text = result["text"]
+         clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
+         summary = summarizer_agent.generate_summary(transcribed_text)
+         structured_data = medical_data_extractor_agent.extract_medical_data(transcribed_text)
+         response = {
+             "transcribed_text": transcribed_text,
+             "summary": summary,
+             "medical_chart": json.loads(structured_data)
+         }
+         os.remove(audio_path)
+         return jsonify(response), 200
+     except Exception as e:
+         logging.error(f"Processing failed: {str(e)}")
+         return jsonify({"error": f"Processing failed: {str(e)}"}), 500
+
+ @app.route('/upload', methods=['POST'])
+ def upload_file():
+     files = request.files.getlist("file")
+     patient_name = request.form.get("patient_name", "").strip()
+     password = request.form.get("password")
+     qa_model_name = request.form.get("qa_model_name")
+     qa_model_type = request.form.get("qa_model_type")
+     ner_model_name = request.form.get("ner_model_name")
+     ner_model_type = request.form.get("ner_model_type")
+     summarizer_model_name = request.form.get("summarizer_model_name")
+     summarizer_model_type = request.form.get("summarizer_model_type")
+
+     if not files:
+         return jsonify({"error": "No file uploaded"}), 400
+
+     try:
+         qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
+         print(f"✅ QA Model Loaded: {qa_model_name}")
+     except Exception as e:
+         return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
+
+     try:
+         ner_pipeline = pipeline(task=ner_model_type, model=ner_model_name)
+         print(f"✅ NER Model Loaded: {ner_model_name}")
+     except Exception as e:
+         return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
+
+     try:
+         summarizer_pipeline = pipeline(task=summarizer_model_type, model=summarizer_model_name)
+         print(f"✅ Summarizer Model Loaded: {summarizer_model_name}")
+     except Exception as e:
+         return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
+
+     extracted_data = []
+     print(patient_name)
+
+     for file in files:
+         if file.filename == '':
+             continue
+         if not allowed_file(file.filename):
+             return jsonify({"error": f"Unsupported file type: {file.filename}. Supported file types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
+         if not patient_name:
+             return jsonify({"error": "Patient name is missing"}), 400
+         valid_size, error_message = check_file_size(file)
+         if not valid_size:
+             return jsonify({"error": error_message}), 400
+
+         filename = secure_filename(file.filename)
+         filepath = os.path.join(UPLOAD_FOLDER, filename)
+         file.save(filepath)
+
+         extracted_text = None
+
+         if filename.endswith(".pdf"):
+             result = extract_text_from_pdf(filepath, password)
+             if isinstance(result, tuple):
+                 extracted_text, status_code = result
+             else:
+                 extracted_text = result
+                 status_code = 200
+             if isinstance(extracted_text, dict) and "error" in extracted_text:
+                 return jsonify(extracted_text), status_code
+         elif filename.endswith(".docx"):
+             extracted_text = extract_text_from_docx(filepath)
+         elif filename.endswith((".jpg", ".jpeg", ".png", ".svg")):
+             extracted_text = extract_text_from_image(filepath)
+
+         if not extracted_text or extracted_text == "No text found":
+             return jsonify({"error": f"Failed to extract text from {filename}"}), 415
+         if extracted_text in ["Image is too blurry, OCR failed.", "OCR failed to extract meaningful text."]:
+             return jsonify({"error": f"'{filename}' is too blurry or text is unreadable."}), 422
+
+         skip_medical_check = request.form.get("skip_medical_check", "false").lower() == "true"
+         if not skip_medical_check:
+             ner_results = ner_pipeline(extracted_text)
+             medical_entities = list(set([r["word"] for r in ner_results if r["entity"].startswith("B-") or r["entity"].startswith("I-")]))
+             print(f"Medical entities found: {medical_entities}")
+             if not medical_entities:
+                 return jsonify({"error": f"'{filename}' is not medically relevant"}), 406
+         else:
+             print(f"Skipping Medical Validation for {filename}")
+
+         skip_patient_check = request.form.get("skip_patient_check", "false").lower() == "true"
+         if not skip_patient_check:
+             try:
+                 error_response = validate_patient_name(extracted_text, patient_name, filename, qa_pipeline)
+                 if error_response:
+                     return error_response
+             except Exception as e:
+                 return jsonify({"error": f"Patient name validation failed: {str(e)}"}), 500
+         else:
+             print(f"Skipping Patient Name Validation for {filename}")
+
+         try:
+             summary = summarizer_pipeline(extracted_text, max_length=350, min_length=50, do_sample=False)[0]["summary_text"]
+         except Exception as e:
+             summary = "Summary failed"
+             print(f"⚠️ Error summarizing: {e}")
+
+         extracted_data.append({
+             "file": filename,
+             "extracted_text": extracted_text,
+             "summary": summary,
+             "message": "Successful"
+         })
+
+         extracted_text = None
+         summary = None
+
+     if not extracted_data:
+         return jsonify({"error": "No valid medical files processed"}), 400
+
+     return jsonify({"extracted_data": extracted_data}), 200
+
+ @app.route('/extract_medical_data_questions', methods=['POST'])
+ def extract_medical_data_questions():
+     """Extract medical data based on predefined questions."""
+     data = request.json
+     qa_model_name = data.get("qa_model_name")
+     qa_model_type = data.get("qa_model_type")
+     if "extracted_data" not in data:
+         return jsonify({"error": "Missing 'extracted_data' in request"}), 400
+
+     if not qa_model_name or not qa_model_type:
+         return jsonify({"error": "Missing 'model_name' or 'model_type'"}), 400
+
+     try:
+         print(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
+         qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
+         loaded_model_name = qa_pipeline.model.config._name_or_path
+         loaded_model_type = qa_pipeline.task
+         print(f"✅ Model loaded: {loaded_model_name}")
+     except Exception as e:
+         print("❌ Error loading model:", str(e))
+         return jsonify({"error": f"Could not load model: {str(e)}"}), 500
+
+     questions = {
+         "Patient Name": "What is the patient's name?",
+         "Age": "What is the patient's age?",
+         "Gender": "What is the patient's gender?",
+         "Date of Birth": "What is the patient's date of birth?",
+         "Patient ID": "What is the patient ID?",
+         "Reason for Visit": "What is the reason for the patient's visit?",
+         "Physician": "Who is the physician in charge of the patient?",
+         "Test Date": "What is the test date?",
+         "Hemoglobin": "What is the patient's hemoglobin level?",
+         "Blood Glucose (Fasting)": "What is the patient's fasting blood glucose level?",
+         "Total Cholesterol": "What is the total cholesterol level?",
+         "LDL Cholesterol": "What is the LDL cholesterol level?",
+         "HDL Cholesterol": "What is the HDL cholesterol level?",
+         "Serum Creatinine": "What is the serum creatinine level?",
+         "Vitamin D (25-OH)": "What is the patient's Vitamin D level?",
+         "Height": "What is the patient's height?",
+         "Weight": "What is the patient's weight?",
+         "Blood Pressure (Systolic)": "What is the patient's systolic blood pressure?",
+         "Blood Pressure (Diastolic)": "What is the patient's diastolic blood pressure?",
+         "Recommendations": "What are the recommendations based on the test results?"
+     }
+
+     structured_response = {"extracted_data": []}
+
+     for file_data in data["extracted_data"]:
+         filename = file_data["file"]
+         context = file_data["extracted_text"]
+
+         if not context:
+             structured_response["extracted_data"].append({
+                 "file": filename,
+                 "medical_terms": "No data extracted",
+             })
+             continue
+
+         extracted_info = {}
+
+         for key, question in questions.items():
+             try:
+                 result = qa_pipeline(question=question, context=context)
+                 extracted_info[key] = clean_result(result.get("answer", "Not Available"))
+             except Exception:
+                 extracted_info[key] = "Error extracting"
+
+         categorized_data = [
+             {
+                 "name": "Patient Information",
+                 "fields": [
+                     {"label": "Patient Name", "value": extracted_info.get("Patient Name", "")},
+                     {"label": "Date of Birth", "value": extracted_info.get("Date of Birth", "")},
+                     {"label": "Gender", "value": extracted_info.get("Gender", "")},
+                     {"label": "Patient ID", "value": extracted_info.get("Patient ID", "")}
+                 ]
+             },
+             {
+                 "name": "Vitals",
+                 "fields": [
+                     {"label": "Height", "value": extracted_info.get("Height", "")},
+                     {"label": "Weight", "value": extracted_info.get("Weight", "")},
+                     {"label": "Blood Pressure", "value": f"{extracted_info.get('Blood Pressure (Systolic)', '')}/{extracted_info.get('Blood Pressure (Diastolic)', '')} mmHg"},
+                     {"label": "Hemoglobin", "value": extracted_info.get("Hemoglobin", "")},
+                     {"label": "Serum Creatinine", "value": extracted_info.get("Serum Creatinine", "")}
+                 ]
+             },
+             {
+                 "name": "Lab Results",
+                 "fields": [
+                     {"label": "Blood Glucose (Fasting)", "value": extracted_info.get("Blood Glucose (Fasting)", "")},
+                     {"label": "Total Cholesterol", "value": extracted_info.get("Total Cholesterol", "")},
+                     {"label": "LDL Cholesterol", "value": extracted_info.get("LDL Cholesterol", "")},
+                     {"label": "HDL Cholesterol", "value": extracted_info.get("HDL Cholesterol", "")},
+                     {"label": "Vitamin D (25-OH)", "value": extracted_info.get("Vitamin D (25-OH)", "")}
+                 ]
+             },
+             {
+                 "name": "Medical Notes",
+                 "fields": [
+                     {"label": "Reason for Visit", "value": extracted_info.get("Reason for Visit", "")},
+                     {"label": "Physician", "value": extracted_info.get("Physician", "")},
+                     {"label": "Test Date", "value": extracted_info.get("Test Date", "")},
+                     {"label": "Recommendations", "value": extracted_info.get("Recommendations", "")}
+                 ]
+             }
+         ]
+         structured_response["extracted_data"].append({
+             "file": filename,
+             "medical_terms": extracted_info,
+             "categorized_data": categorized_data,
+             "model_used": loaded_model_name,
+             "model_type": loaded_model_type
+         })
+
+         save_data_to_storage(filename, structured_response)
+         print(f"✅ Extracted data saved to: {os.path.join(UPLOAD_FOLDER, f'{filename}.json')}")
+
+     return jsonify(structured_response), 200
+
+ def get_data_from_storage(filename):
+     try:
+         filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
+         print(f"🔍 Looking for file at: {filepath}")
+         if not os.path.exists(filepath):
+             print(f"🚫 File not found at: {filepath}")
+             return None
+         with open(filepath, "r") as file:
+             data = json.load(file)
+         print(f"✅ File found and loaded: {filepath}")
+         return data
+     except Exception as e:
+         print(f"🚨 Error loading data: {e}")
+         return None
+
+ def save_data_to_storage(filename, data):
+     try:
+         filename = filename.rsplit(".", 1)[0]
+         filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
+         print(f"Saving to: {filepath}")
+         print(f"Directory exists: {os.path.exists(UPLOAD_FOLDER)}")
+         if not os.path.exists(UPLOAD_FOLDER):
+             print(f"Directory not found. Creating: {UPLOAD_FOLDER}")
+             os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+         with open(filepath, "w") as file:
+             json.dump(data, file)
+         print(f"✅ Data saved successfully to {filepath}")
+     except Exception as e:
+         print(f"🚨 Exception during save: {e}")
+
+ @app.route('/get_updated_medical_data', methods=['GET'])
+ def get_updated_data():
+     file_name = request.args.get('file')
+     if not file_name:
+         return jsonify({"error": "File name is required"}), 400
+     file_name = file_name.rsplit(".", 1)[0]
+     updated_data = get_data_from_storage(file_name)
+     if updated_data:
+         return jsonify({"file": file_name, "data": updated_data}), 200
+     else:
+         return jsonify({"error": f"File '{file_name}' not found"}), 404
+
+ @app.route('/update_medical_data', methods=['PUT'])
+ def update_medical_data():
+     try:
+         data = request.json
+         print("Received data:", data)
+         filename = data.get("file")
+         filename = filename.rsplit(".", 1)[0]
+         updates = data.get("updates", [])
+         if not filename or not updates:
+             return jsonify({"error": "File name or updates missing"}), 400
+         existing_data = get_data_from_storage(filename)
+         if not existing_data:
+             return jsonify({"error": f"File '{filename}' not found"}), 404
+         for update in updates:
+             category = update.get("category")
+             field = update.get("field")
+             new_value = update.get("value")
+             updated = False
+             for cat in existing_data.get("extracted_data", []):
+                 for categorized_data in cat.get("categorized_data", []):
+                     if categorized_data.get("name") == category:
+                         for fld in categorized_data.get("fields", []):
+                             if fld.get("label") == field:
+                                 print(f"🔄 Updating {category} -> {field} from '{fld['value']}' to '{new_value}'")
+                                 fld["value"] = new_value
+                                 updated = True
+                                 break
+                     if updated:
+                         break
+                 if updated:
+                     break
+         save_data_to_storage(filename, existing_data)
+         print("✅ Updated data:", existing_data)
+         return jsonify({"message": "Data updated successfully", "updated_data": existing_data}), 200
+     except Exception as e:
+         print("❌ Error:", str(e))
+         return jsonify({"error": str(e)}), 500
+
+ @app.route('/')
+ def home():
+     return "Medical Data Extraction API is running!"
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=5000, debug=True)
document_based_extraction.py ADDED
@@ -0,0 +1,1188 @@
1
+ import os, re, json
2
+ import time, logging, functools
3
+ import pytesseract
4
+ import cv2
5
+ import pdfplumber
6
+ import numpy as np
7
+ from PIL import Image
8
+ from PyPDF2 import PdfReader
9
+ from pdf2image import convert_from_path
10
+ from flask import Flask, request, jsonify
11
+ from flask_cors import CORS
12
+ import torch
13
+ from werkzeug.utils import secure_filename
14
+ from docx import Document
15
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+ from collections import defaultdict
18
+ from huggingface_hub import login
19
+
20
+
21
+ # -------------------- Logging Config -------------------- #
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format="%(asctime)s - %(levelname)s - %(message)s",
25
+ handlers=[
26
+ logging.FileHandler("app.log"),
27
+ logging.StreamHandler()
28
+ ]
29
+ )
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # -------------------- Execution Time Decorator -------------------- #
33
+ def log_execution_time(level=logging.INFO):
34
+ def decorator(func):
35
+ @functools.wraps(func)
36
+ def wrapper(*args, **kwargs):
37
+ start_time = time.time()
38
+ try:
39
+ result = func(*args, **kwargs)
40
+ duration = time.time() - start_time
41
+ logger.log(level, f"⏱️ {func.__name__} executed in {duration:.6f} seconds")
42
+ return result
43
+ except Exception as e:
44
+ duration = time.time() - start_time
45
+ logger.exception(f"❌ Exception in {func.__name__} after {duration:.6f} seconds: {e}")
46
+ raise
47
+ return wrapper
48
+ return decorator
49
+
50
+
51
+ # Read the Hugging Face token from the environment instead of hardcoding it —
52
+ # a token committed to source control is leaked and should be revoked.
53
+ hf_token = os.environ.get("HF_TOKEN")
+ if hf_token:
+ login(hf_token)  # 🧠 stored once; every model load will use it
54
+
55
+ executor = ThreadPoolExecutor(max_workers=5)
56
+ logger.info("Executor initialized with 5 workers")
57
+
58
+ # Set Tesseract OCR Path
59
+ # in Windows
60
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
61
+ # in Linux
62
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # apt's tesseract-ocr package installs here on Debian/Ubuntu
63
+
64
+ # Set up Flask app
65
+ app = Flask(__name__)
66
+ CORS(app)
67
+
68
+ UPLOAD_FOLDER = "uploads"
69
+ ALLOWED_EXTENSIONS = {"pdf", "jpg", "jpeg", "png", "svg", "docx", "doc"}
70
+ app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER
71
+
72
+ # Set file size limits
73
+ MAX_SIZE_PDF_DOCS = 1 * 1024 * 1024 * 1024 # 1 GB
74
+ MAX_SIZE_IMAGES = 500 * 1024 * 1024 # 500 MB
75
+
76
+
77
+ # # Load ClinicalBERT Model for Classification
78
+ # try:
79
+ # zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
80
+ # print("✅ zero_shot_classifier Model Loaded Successfully")
81
+ # except Exception as e:
82
+ # zero_shot_classifier = None
83
+ # print("❌ Error loading ClinicalBERT Model:", str(e))
84
+
85
+
86
+ if not os.path.exists(UPLOAD_FOLDER):
87
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
88
+
89
+ # NER to Detect medical info
90
+ CONFIDENCE_THRESHOLD = 0.80
91
+
92
+
93
+ @log_execution_time()
94
+ def extract_medical_entities(text):
95
+ if not text or not text.strip():
96
+ return ["No medical entities found"]
97
+ if ner_pipeline is None: # type: ignore
98
+ logger.warning("NER model is not loaded, skipping entity extraction.")
99
+ return ["No medical entities found"]
100
+
101
+ ner_results = ner_pipeline(text) # type: ignore
102
+ relevant_entities = {
103
+ # Diseases & Symptoms
104
+ "Disease",
105
+ "MedicalCondition",
106
+ "Symptom",
107
+ "Sign_or_Symptom",
108
+ "B-DISEASE",
109
+ "I-DISEASE",
110
+ # Tests, Measurements, and Lab Values
111
+ "Test",
112
+ "Measurement",
113
+ "B-TEST",
114
+ "I-TEST",
115
+ "Lab_value",
116
+ "B-Lab_value",
117
+ "I-Lab_value",
118
+ # Medications, Treatments, and Procedures
119
+ "Medication",
120
+ "B-MEDICATION",
121
+ "I-MEDICATION",
122
+ "Treatment",
123
+ "Procedure",
124
+ "B-Diagnostic_procedure",
125
+ "I-Diagnostic_procedure",
126
+ # Body Parts & Medical Anatomy
127
+ "Anatomical_site",
128
+ "Body_Part",
129
+ "Organ_or_Tissue",
130
+ # Medical Procedures
131
+ "Diagnostic_procedure",
132
+ "Surgical_Procedure",
133
+ "Therapeutic_Procedure",
134
+ # Clinical Terms
135
+ "Health_condition",
136
+ "B-Health_condition",
137
+ "I-Health_condition",
138
+ "Pathological_Condition",
139
+ "Clinical_Event",
140
+ # Biological & Chemical Substances (Relevant to Lab Reports)
141
+ "Chemical_Substance",
142
+ "B-Chemical_Substance",
143
+ "I-Chemical_Substance",
144
+ "Biological_Entity",
145
+ "B-Biological_Entity",
146
+ "I-Biological_Entity",
147
+ }
148
+
149
+ medical_entities = set()
150
+ for ent in ner_results:
151
+ entity_label = ent.get("entity_group") or ent.get("entity")
152
+ if entity_label in relevant_entities and ent["score"] >= CONFIDENCE_THRESHOLD:
153
+ word = ent["word"].lower().strip().replace("-", "") # Normalize text
154
+ if len(word) > 2: # Ignore short/junk words
155
+ medical_entities.add(word)
156
+
157
+ if len(medical_entities) >= 5:
158
+ logger.info(f"Extracted {len(medical_entities)} medical entities")
159
+ return list(medical_entities)
160
+
161
+ logger.info("Not enough medical entities found")
162
+ return ["No medical entities found"]
163
+
164
+
165
+ # Validation: Check Allowed File Types
166
+ def allowed_file(filename):
167
+ return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
168
+
169
+
170
+ # Validation: Check File Size
171
+ def check_file_size(file):
172
+ file.seek(0, os.SEEK_END)
173
+ size = file.tell()
174
+ file.seek(0)
175
+ extension = file.filename.rsplit(".", 1)[-1].lower()
176
+ logger.info(f"Checking file size for '{file.filename}' - Size: {size} bytes")
177
+ if extension in {"pdf", "docx"} and size > MAX_SIZE_PDF_DOCS:
178
+ logger.warning(f"{file.filename} exceeds 1GB limit")
179
+ return False, f"File {file.filename} exceeds 1MB size limit"
180
+ elif extension in {"jpg", "jpeg", "png"} and size > MAX_SIZE_IMAGES:
181
+ logger.warning(f"{file.filename} exceeds 500MB image limit")
182
+ return False, f"Image {file.filename} exceeds 500KB size limit"
183
+ return True, None
184
+
185
+
186
+ @log_execution_time()
187
+ def extract_patient_name(text, qa_pipeline):
188
+ if not text or not qa_pipeline:
189
+ return None
190
+ try:
191
+ result = qa_pipeline(question="What is the patient's name?", context=text)
192
+ answer = result.get("answer", "").strip()
193
+ logger.info(f"Extracted patient name: {answer}")
194
+ return answer
195
+ except Exception as e:
196
+ logger.error(f"Error extracting patient name: {e}")
197
+ return None
198
+
199
+
200
+ def normalize_name(name):
201
+ """Cleans and normalizes names for comparison, removing salutations dynamically"""
202
+ if not name:
203
+ return ""
204
+ name = name.lower().strip()
205
+ name = re.sub(r"[^\w\s]", "", name)
206
+ name = re.sub(r"^\b\w{1,5}\b\s+", "", name) # Matches short words at the start
207
+ return name
208
+
209
+
210
+ @log_execution_time()
211
+ def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
212
+ """Validates if the extracted name matches the registered patient name"""
213
+ detected_name = extract_patient_name(extracted_text, qa_pipeline)
214
+ if not detected_name:
215
+ logger.warning(f"Could not determine patient name from {filename}")
216
+ return (
217
+ jsonify({"error": f"Could not determine patient name from {filename}"}),
218
+ 400,
219
+ )
220
+
221
+ normalized_detected_name = normalize_name(detected_name)
222
+ normalized_patient_name = normalize_name(patient_name)
223
+
224
+ if normalized_detected_name not in normalized_patient_name:
225
+ logger.warning(
226
+ f"Patient mismatch in file '{filename}': Found '{detected_name}'"
227
+ )
228
+ return (
229
+ jsonify(
230
+ {
231
+ "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
232
+ }
233
+ ),
234
+ 400,
235
+ )
236
+ logger.info(f"Patient name validation passed for '{filename}'")
237
+ return None # No error, validation passed
238
+
239
+
240
+ # Check if the image is blurred using the Laplacian method
241
+ def is_blurred(image_path, variance_threshold=150):
242
+ try:
243
+ image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
244
+ if image is None:
245
+ logger.error(f"Unable to read image: {image_path}")
246
+ return True # Assume it's blurry if not readable
247
+
248
+ # Compute Laplacian variance
249
+ laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
250
+ logger.info(
251
+ f"Blur Check on '{image_path}': Laplacian Variance = {laplacian_var:.2f} (Threshold = {variance_threshold})"
252
+ )
253
+
254
+ # Compute Edge Density (Additional Check)
255
+ edges = cv2.Canny(image, 50, 150)
256
+ edge_density = np.mean(edges)
257
+ logger.info(f"Edge Density for '{image_path}': {edge_density:.2f}")
258
+
259
+ is_blurry = laplacian_var < variance_threshold and edge_density < 10
260
+ if is_blurry:
261
+ logger.warning(f"Image '{image_path}' flagged as blurry.")
262
+ return is_blurry
263
+ except Exception as e:
264
+ logger.exception(f"Exception during blur detection for '{image_path}': {e}")
265
+ return True # Assume it's blurry on failure
266
+
267
+
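# A quick way to calibrate the variance_threshold above on your own scans:
# print the Laplacian variance for a few known-sharp and known-blurry images
# and pick a cut-off between the two clusters (150 is a reasonable default).
# The file names below are hypothetical.
#
#   for p in ["sharp_scan.png", "blurry_scan.png"]:
#       img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
#       print(p, cv2.Laplacian(img, cv2.CV_64F).var())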
268
+ # Helper Function: Extract Text from Images (OCR) with Blur Detection
269
+ @log_execution_time()
270
+ def extract_text_from_image(filepath):
271
+ try:
272
+ # Check if the image is blurry
273
+ if is_blurred(filepath):
274
+ logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
275
+ return "Image is too blurry, OCR failed."
276
+
277
+ image = cv2.imread(filepath)
278
+ if image is None:
279
+ logger.error(f"OCR failed: Unable to read image '{filepath}'.")
280
+ return "Image could not be read."
281
+
282
+ # Convert to Grayscale and Apply Thresholding
283
+ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
284
+ gray = cv2.GaussianBlur(gray, (5, 5), 0)
285
+ gray = cv2.adaptiveThreshold(
286
+ gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
287
+ )
288
+
289
+ # Apply dilation (bolds the text) for better OCR accuracy
290
+ kernel = np.ones((2, 2), np.uint8)
291
+ gray = cv2.dilate(gray, kernel, iterations=1)
292
+ processed_path = f"{filepath}_processed.png"
293
+ cv2.imwrite(processed_path, gray)
294
+ logger.info(f"Image preprocessed and saved: {processed_path}")
295
+ text = pytesseract.image_to_string(
296
+ Image.open(processed_path), lang="eng"
297
+ ).strip()
298
+ # Validate OCR output (Reject if too little text is extracted)
299
+ word_count = len(text.split())
300
+ logger.info(
301
+ f"OCR completed for '{filepath}' with {word_count} words extracted."
302
+ )
303
+
304
+ if word_count < 5:
305
+ logger.warning(f"OCR output too small for '{filepath}'. Might be junk.")
306
+ return "OCR failed to extract meaningful text."
307
+
308
+ return text
309
+
310
+ except Exception as e:
311
+ logger.exception(f"Error extracting text from image '{filepath}': {e}")
312
+ return "Failed to extract text"
313
+
314
+
315
+ # Helper Function: Extract Text from PDF
316
+ @log_execution_time()
317
+ def extract_text_from_pdf(filepath, password=None):
318
+ """Extract text from PDFs using pdfplumber (faster) or OCR (if needed)."""
319
+ text = ""
320
+
321
+ try:
322
+ logger.info(f"Starting PDF extraction: {filepath}")
323
+ reader = PdfReader(filepath)
324
+
325
+ if reader.is_encrypted:
326
+ if not password:
327
+ logger.warning("Encrypted PDF without password.")
328
+ return {
329
+ "error": "File is password-protected. Please provide a password."
330
+ }, 401
331
+
332
+ # ✅ Attempt to decrypt
333
+ decryption_result = reader.decrypt(password)
334
+ if decryption_result == 0: # Decryption failed
335
+ logger.error("Incorrect password provided.")
336
+ return {"error": "Invalid password provided."}, 403
337
+ else:
338
+ logger.info("PDF decryption successful.")
339
+
340
+ text = "\n".join([page.extract_text() or "" for page in reader.pages])
341
+ if text.strip():
342
+ logger.info("Text extracted from decrypted PDF.")
343
+ return text.strip(), 200
344
+
345
+ # ✅ Now, use pdfplumber for text extraction
346
+ with pdfplumber.open(filepath) as pdf:
347
+ for page in pdf.pages:
348
+ page_text = page.extract_text()
349
+ if page_text:
350
+ text += page_text + "\n"
351
+
352
+ if text.strip():
353
+ logger.info(
354
+ f"PDF text extracted using pdfplumber: {len(text.split())} words."
355
+ )
356
+ return text.strip(), 200 # ✅ Always return a tuple (text, status)
357
+
358
+ logger.info("No text found via pdfplumber. Falling back to OCR.")
359
+ # ✅ Use OCR if the PDF has no selectable text
360
+ images = convert_from_path(filepath)
361
+ with ThreadPoolExecutor(max_workers=5) as pool:
362
+ ocr_text = list(
363
+ pool.map(
364
+ lambda img: pytesseract.image_to_string(img, lang="eng"), images
365
+ )
366
+ )
367
+
368
+ full_ocr_text = "\n".join(ocr_text).strip()
369
+ logger.info(
370
+ f"OCR fallback complete for PDF: {len(full_ocr_text.split())} words extracted."
371
+ )
372
+
373
+ return (full_ocr_text, 200) if full_ocr_text else ("No text found", 415)
374
+
375
+ except Exception as e:
376
+ logger.exception(f"Error during PDF processing: {filepath}")
377
+ return "Failed to extract text"
378
+
379
+
380
+ # Helper Function: Extract Text from DOCX
381
+ @log_execution_time()
382
+ def extract_text_from_docx(filepath):
383
+ try:
384
+ doc = Document(filepath)
385
+ text = "\n".join([para.text for para in doc.paragraphs])
386
+ word_count = len(text.split())
387
+ logger.info(f"DOCX extracted from '{filepath}': {word_count} words.")
388
+ return text.strip() or None
389
+ except Exception as e:
390
+ logger.exception(f"Failed to extract text from DOCX: {filepath}")
391
+ return None
392
+
393
+
394
+ # Masking function to hide sensitive data
395
+ def mask_sensitive_info(text):
396
+ text = re.sub(r"(?<=\b\w{2})\w+(?=\s\w{2,})", "*", text) # Mask names
397
+ text = re.sub(
398
+ r"\b(\d{2})\d{2}-(\d{2})\d{2}-(\d{2})\d{2}\b", r"\2-\3-", text
399
+ ) # Mask DOB
400
+ text = re.sub(r"\b(\d{8})(\d{2})\b", r"\2", text) # Mask phone numbers
401
+ return text
402
+
403
+
404
+ # ------------------Upload Documents ------------------ #
405
+ # API Route: Upload File & Extract Text
406
+ @app.route("/upload", methods=["POST"])
407
+ @log_execution_time()
408
+ def upload_file():
409
+ logger.info("📥 Upload request received")
410
+ files = request.files.getlist("file")
411
+ patient_name = request.form.get("patient_name", "").strip()
412
+ password = request.form.get("password") # Get password if provided
413
+ # Dynamic model info from form
414
+ qa_model_name = request.form.get("qa_model_name")
415
+ qa_model_type = request.form.get("qa_model_type")
416
+
417
+ ner_model_name = request.form.get("ner_model_name")
418
+ ner_model_type = request.form.get("ner_model_type")
419
+
420
+ summarizer_model_name = request.form.get("summarizer_model_name")
421
+ summarizer_model_type = request.form.get("summarizer_model_type")
422
+
423
+ if not files:
424
+ logger.warning("No file uploaded")
425
+ return jsonify({"error": "No file uploaded"}), 400
426
+
427
+ # 🔌 Load models dynamically
428
+ try:
429
+ qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
430
+ logger.info(f"✅ QA model loaded: {qa_model_name}")
431
+ except Exception as e:
432
+ logger.error(f"❌ QA model load failed: {e}")
433
+ return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
434
+
435
+ try:
436
+ ner_pipeline = pipeline(task=ner_model_type, model=ner_model_name)
437
+ logger.info(f"✅ NER model loaded: {ner_model_name}")
438
+ except Exception as e:
439
+ logger.error(f"❌ NER model load failed: {e}")
440
+ return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
441
+
442
+ try:
443
+ summarizer_pipeline = pipeline(
444
+ task=summarizer_model_type, model=summarizer_model_name
445
+ )
446
+ logger.info(f"✅ Summarizer model loaded: {summarizer_model_name}")
447
+ except Exception as e:
448
+ logger.error(f"❌ Summarizer model load failed: {e}")
449
+ return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
450
+
451
+ extracted_data = []
452
+ logger.info("Patient name received: %s", patient_name)
453
+
454
+ for file in files:
455
+ logger.info(f"📂 Processing file: {file.filename}")
456
+
457
+ if file.filename == "":
458
+ logger.warning("Skipping unnamed file")
459
+ continue # Skip empty file names
460
+
461
+ if not allowed_file(file.filename):
462
+ logger.warning(f"Unsupported file type: {file.filename}")
463
+ return (
464
+ jsonify(
465
+ {
466
+ "error": f"Unsupported file type: {file.filename}. Supported file types are: {', '.join(ALLOWED_EXTENSIONS)}"
467
+ }
468
+ ),
469
+ 400,
470
+ )
471
+
472
+ if not patient_name:
473
+ logger.warning("Patient name missing")
474
+ return jsonify({"error": "Patient name is missing"}), 400
475
+
476
+ # *Check file size*
477
+ valid_size, error_message = check_file_size(file)
478
+ if not valid_size:
479
+ logger.warning(f"❌ File size validation failed: {error_message}")
480
+ return jsonify({"error": error_message}), 400
481
+
482
+ filename = secure_filename(file.filename)
483
+ filepath = os.path.join(UPLOAD_FOLDER, filename)
484
+ file.save(filepath)
485
+ logger.info(f"✅ File saved: {filepath}")
486
+
487
+ extracted_text = None
488
+
489
+ # ✅ *Extract text based on file type*
490
+ if filename.endswith(".pdf"):
491
+ logger.info("🧾 Extracting text from PDF")
492
+ result = extract_text_from_pdf(filepath, password)
493
+
494
+ # ✅ If PDF requires a password, return 401
495
+ if isinstance(result, tuple):
496
+ extracted_text, status_code = result
497
+ else:
498
+ extracted_text = result
499
+ status_code = 200
500
+
501
+ if isinstance(extracted_text, dict) and "error" in extracted_text:
502
+ logger.warning(f"⚠️ PDF extraction error: {extracted_text}")
503
+ return jsonify(extracted_text), status_code
504
+ elif filename.endswith(".docx"):
505
+ extracted_text = extract_text_from_docx(filepath)
506
+ elif filename.endswith((".jpg", ".jpeg", ".png", ".svg")):
507
+ logger.info("🖼️ Extracting text from image")
508
+ extracted_text = extract_text_from_image(filepath)
509
+
510
+ if not extracted_text or extracted_text == "No text found":
511
+ logger.warning(f"⚠️ No text extracted from {filename}")
512
+ return (
513
+ jsonify({"error": f"Failed to extract text from {filename}"}),
514
+ 415,
515
+ ) # Unsupported Media Type
516
+
517
+ # reject blurred images
518
+ if extracted_text in [
519
+ "Image is too blurry, OCR failed.",
520
+ "OCR failed to extract meaningful text.",
521
+ ]:
522
+ logger.warning(f"🔍 OCR failed or image too blurry: {filename}")
523
+ return (
524
+ jsonify(
525
+ {"error": f"'{filename}' is too blurry or text is unreadable."}
526
+ ),
527
+ 422,
528
+ ) # Unprocessable Entity
529
+
530
+ # ✅ Medical Validation using NER
531
+ skip_medical_check = (
532
+ request.form.get("skip_medical_check", "false").lower() == "true"
533
+ )
534
+ if not skip_medical_check:
535
+ logger.info("🧠 Running NER medical validation")
536
+ start_time = time.time()
537
+ ner_results = ner_pipeline(extracted_text)
538
+ medical_entities = list(
539
+ set(
540
+ [
541
+ r["word"]
542
+ for r in ner_results
543
+ if r["entity"].startswith("B-") or r["entity"].startswith("I-")
544
+ ]
545
+ )
546
+ )
547
+ elapsed_time = time.time() - start_time
548
+ logger.info(f"⏱️ Medical entity validation took {elapsed_time:.2f}s")
549
+
550
+ logger.info(f"🩺 Medical entities found: {medical_entities}")
551
+ if not medical_entities:
552
+ logger.warning(f"❌ No medical relevance in {filename}")
553
+ return (
554
+ jsonify({"error": f"'{filename}' is not medically relevant"}),
555
+ 406,
556
+ )
557
+ else:
558
+ logger.info(f"⏭️ Skipping medical validation for {filename}")
559
+
560
+ # # ✅ Patient Name Validation using QA
561
+ # skip_patient_check = request.form.get("skip_patient_check", "false").lower() == "true"
562
+ # if not skip_patient_check:
563
+ # try:
564
+ # logger.info("🧍 Validating patient name")
565
+ # start_time = time.time()
566
+ # error_response = validate_patient_name(extracted_text, patient_name, filename,qa_pipeline)
567
+ # elapsed_time = time.time() - start_time
568
+ # logger.info(f"⏱️ Patient name validation took {elapsed_time:.2f}s")
569
+
570
+ # if error_response:
571
+ # return error_response
572
+ # except Exception as e:
573
+ # logger.error(f"❌ Patient name validation failed: {e}")
574
+ # return jsonify({"error": f"Patient name validation failed: {str(e)}"}), 500
575
+ # else:
576
+ # logger.info(f"⏭️ Skipping patient name validation for {filename}")
577
+
578
+ # ✨ Generate Summary using Summarizer
579
+ try:
580
+ logger.info("📝 Generating summary: %s", extracted_text)
581
+
582
+ start_time = time.time()
583
+ summary = summarizer_pipeline(
584
+ extracted_text, max_length=350, min_length=50, do_sample=False
585
+ )[0]["summary_text"]
586
+ elapsed_time = time.time() - start_time
587
+
588
+ logger.info(f"✅ Summary generated: {summary}")
589
+ logger.info(f"⏱️ Summary generation took {elapsed_time:.2f} seconds")
590
+ except Exception as e:
591
+ summary = "Summary failed"
592
+ logger.warning(f"⚠ Summary generation failed: {e}")
593
+ # # Classify report type
594
+ # report_type = classify_medical_document(extracted_text)
595
+ # print(report_type)
596
+ # ✅ Summarize extracted text
597
+ extracted_data.append(
598
+ {
599
+ "file": filename,
600
+ # "document_type": report_type,
601
+ "extracted_text": extracted_text,
602
+ "summary": summary,
603
+ "message": "Successful",
604
+ }
605
+ )
606
+ logger.info(f"✅ Finished processing file: {filename}")
607
+
608
+ if not extracted_data:
609
+ logger.warning("❌ No valid medical files processed")
610
+ return jsonify({"error": "No valid medical files processed"}), 400
611
+
612
+ logger.info("📦 Upload processing completed successfully")
613
+ return jsonify({"extracted_data": extracted_data}), 200
614
+
615
+
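# --- Standalone client sketch (separate script, not part of this module) ----
# A minimal client for the /upload route above. Assumes the `requests` package
# and a server on localhost:5000; the model names are illustrative placeholders,
# not checkpoints this code endorses.
import requests

with open("report.pdf", "rb") as f:  # hypothetical input file
    resp = requests.post(
        "http://localhost:5000/upload",
        files={"file": f},
        data={
            "patient_name": "John Doe",
            "qa_model_name": "deepset/roberta-base-squad2",
            "qa_model_type": "question-answering",
            "ner_model_name": "d4data/biomedical-ner-all",
            "ner_model_type": "ner",
            "summarizer_model_name": "facebook/bart-large-cnn",
            "summarizer_model_type": "summarization",
        },
    )
print(resp.status_code, resp.json())
# -----------------------------------------------------------------------------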
616
+ # # API Route: Extract Medical Data Based on Predefined Questions
617
+ # @app.route('/extract_medical_data', methods=['POST'])
618
+ # def extract_medical_data():
619
+ # data = request.json
620
+ # print(f"📥 Incoming request data: {data}")
621
+
622
+ # qa_model_name = data.get("qa_model_name")
623
+ # qa_model_type = data.get("qa_model_type")
624
+
625
+ # if "extracted_data" not in data:
626
+ # return jsonify({"error": "Missing 'extracted_data' in request"}), 400
627
+
628
+ # if not qa_model_name or not qa_model_type:
629
+ # return jsonify({"error": "Missing 'model_name' or 'model_type'"}), 400
630
+
631
+ # try:
632
+ # print(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
633
+ # qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
634
+ # print(f"✅ Model loaded: {qa_pipeline.model.config._name_or_path}")
635
+ # except Exception as e:
636
+ # print("❌ Error loading model:", str(e))
637
+ # return jsonify({"error": f"Could not load model: {str(e)}"}), 500
638
+
639
+ # questions = {
640
+ # "Patient Name": "What is the patient's name?",
641
+ # "Age": "What is the patient's age?",
642
+ # "Gender": "What is the patient's gender?",
643
+ # "Date of Birth": "What is the patient's date of birth?",
644
+ # "Patient ID": "What is the patient ID?",
645
+ # "Reason for Visit": "What is the reason for the patient's visit?",
646
+ # "Physician": "Who is the physician in charge of the patient?",
647
+ # "Test Date": "What is the test date?",
648
+ # "Hemoglobin": "What is the patient's hemoglobin level?",
649
+ # "Blood Glucose (Fasting)": "What is the patient's fasting blood glucose level?",
650
+ # "Total Cholesterol": "What is the total cholesterol level?",
651
+ # "LDL Cholesterol": "What is the LDL cholesterol level?",
652
+ # "HDL Cholesterol": "What is the HDL cholesterol level?",
653
+ # "Serum Creatinine": "What is the serum creatinine level?",
654
+ # "Vitamin D (25-OH)": "What is the patient's Vitamin D level?",
655
+ # "Height": "What is the patient's height?",
656
+ # "Weight": "What is the patient's weight?",
657
+ # "Blood Pressure (Systolic)": "What is the patient's systolic blood pressure?",
658
+ # "Blood Pressure (Diastolic)": "What is the patient's diastolic blood pressure?",
659
+ # "Recommendations": "What are the recommendations based on the test results?"
660
+ # }
661
+
662
+ # structured_response = {"extracted_data": []}
663
+
664
+ # for file_data in data["extracted_data"]:
665
+ # filename = file_data["file"]
666
+ # context = file_data["extracted_text"]
667
+
668
+ # if not context:
669
+ # structured_response["extracted_data"].append({
670
+ # "file": filename,
671
+ # "medical_terms": "No data extracted"
672
+ # })
673
+ # continue
674
+
675
+ # # Prepare batch QA input
676
+ # qa_inputs = [
677
+ # {"question": q, "context": context}
678
+ # for q in questions.values()
679
+ # ]
680
+
681
+ # try:
682
+ # qa_outputs = qa_pipeline(qa_inputs)
683
+ # print("📤 Batch QA outputs:", qa_outputs)
684
+ # except Exception as e:
685
+ # print("⚠️ Batch failed, falling back to loop:", str(e))
686
+ # qa_outputs = [qa_pipeline(q) for q in qa_inputs]
687
+
688
+ # # Map answers back to questions
689
+ # extracted_info = {}
690
+ # for i, key in enumerate(questions.keys()):
691
+ # answer = qa_outputs[i].get("answer", "").strip()
692
+ # score = qa_outputs[i].get("score", 0.0)
693
+
694
+ # # If the model returns an empty string or very low confidence, mark as "Not Mentioned"
695
+ # if not answer or score < 0.1:
696
+ # extracted_info[key] = "Not Mentioned"
697
+ # else:
698
+ # extracted_info[key] = answer
699
+
700
+ # # Optional: Clean results
701
+ # # extracted_info = {k: clean_result(v) for k, v in extracted_info.items()}
702
+
703
+ # categorized_data = [
704
+ # {
705
+ # "name": "Patient Information",
706
+ # "fields": [
707
+ # {"label": "Patient Name", "value": extracted_info.get("Patient Name", "")},
708
+ # {"label": "Date of Birth", "value": extracted_info.get("Date of Birth", "")},
709
+ # {"label": "Gender", "value": extracted_info.get("Gender", "")},
710
+ # {"label": "Patient ID", "value": extracted_info.get("Patient ID", "")}
711
+ # ]
712
+ # },
713
+ # {
714
+ # "name": "Vitals",
715
+ # "fields": [
716
+ # {"label": "Height", "value": extracted_info.get("Height", "")},
717
+ # {"label": "Weight", "value": extracted_info.get("Weight", "")},
718
+ # {"label": "Blood Pressure", "value": f"{extracted_info.get('Blood Pressure (Systolic)', '')}/{extracted_info.get('Blood Pressure (Diastolic)', '')} mmHg"},
719
+ # {"label": "Hemoglobin", "value": extracted_info.get("Hemoglobin", "")},
720
+ # {"label": "Serum Creatinine", "value": extracted_info.get("Serum Creatinine", "")}
721
+ # ]
722
+ # },
723
+ # {
724
+ # "name": "Lab Results",
725
+ # "fields": [
726
+ # {"label": "Blood Glucose (Fasting)", "value": extracted_info.get("Blood Glucose (Fasting)", "")},
727
+ # {"label": "Total Cholesterol", "value": extracted_info.get("Total Cholesterol", "")},
728
+ # {"label": "LDL Cholesterol", "value": extracted_info.get("LDL Cholesterol", "")},
729
+ # {"label": "HDL Cholesterol", "value": extracted_info.get("HDL Cholesterol", "")},
730
+ # {"label": "Vitamin D (25-OH)", "value": extracted_info.get("Vitamin D (25-OH)", "")}
731
+ # ]
732
+ # },
733
+ # {
734
+ # "name": "Medical Notes",
735
+ # "fields": [
736
+ # {"label": "Reason for Visit", "value": extracted_info.get("Reason for Visit", "")},
737
+ # {"label": "Physician", "value": extracted_info.get("Physician", "")},
738
+ # {"label": "Test Date", "value": extracted_info.get("Test Date", "")},
739
+ # {"label": "Recommendations", "value": extracted_info.get("Recommendations", "")}
740
+ # ]
741
+ # }
742
+ # ]
743
+
744
+ # structured_response["extracted_data"].append({
745
+ # "file": filename,
746
+ # "medical_terms": extracted_info,
747
+ # "categorized_data": categorized_data
748
+ # })
749
+
750
+ # save_data_to_storage(filename, structured_response)
751
+ # print(f"✅ Extracted data saved to: {os.path.join(UPLOAD_FOLDER, f'{filename}.json')}")
752
+
753
+ # return jsonify(structured_response)
754
+
755
+
756
+ # ------------------ CLEAN FUNCTION ------------------ #
757
+ @log_execution_time()
758
+ def clean_result(value):
759
+ logger.debug("Cleaning value: %s", value)
760
+ if isinstance(value, str):
761
+ value = re.sub(r"\s+", " ", value)
762
+ value = re.sub(r"[-_:]+", " ", value)
763
+ value = re.sub(r"[^\x00-\x7F]+", " ", value)
764
+ value = re.sub(
765
+ r"(?<=\d),(?=\d)", "", value
766
+ ) # Remove commas in numbers like 250,000
767
+ return value.strip() if value.strip() else "Not Available"
768
+ elif isinstance(value, list):
769
+ cleaned = [clean_result(v) for v in value if v is not None]
770
+ return cleaned if cleaned else ["Not Available"]
771
+ elif isinstance(value, dict):
772
+ return {k: clean_result(v) for k, v in value.items()}
773
+ return value
774
+
775
+ # ------------------Group by Category ------------------ #
776
+ @log_execution_time()
777
+ def group_by_category(data):
778
+ logger.info("Grouping extracted items by category")
779
+ grouped = defaultdict(list)
780
+ category_times = {}
781
+
782
+ for item in data:
783
+ cat = item.get("category", "General")
784
+ start_time = time.time()
785
+ grouped[cat].append(
786
+ {
787
+ "question": item.get("question", "Not Created"),
788
+ "label": item.get("label", "Unknown"),
789
+ "answer": item.get("answer", "Not Available"),
790
+ }
791
+ )
792
+ elapsed = time.time() - start_time
793
+ category_times[cat] = category_times.get(cat, 0) + elapsed
794
+
795
+ for cat, details in grouped.items():
796
+ logger.info(f"📂 Category '{cat}': {len(details)} items, time taken: {category_times[cat]:.4f}s")
797
+
798
+ return [{"category": k, "detail": v} for k, v in grouped.items()]
799
+
800
+
801
+ # ------------------detect duplicate to remove it ------------------ #
802
+ @log_execution_time()
803
+ def deduplicate_extractions(data):
804
+ logger.info("Deduplicating extracted data")
805
+ seen = set()
806
+ unique = []
807
+ for item in data:
808
+ # Deduplicate on the label alone; widen this to a tuple of fields
809
+ # (e.g. label and answer) if distinct values must be preserved.
+ key = item.get("label")
810
+ if key not in seen:
811
+ seen.add(key)
812
+ unique.append(item)
813
+ return unique
814
+
815
+
816
+ # Load tokenizer outside the route for performance
817
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
818
+
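# Note: this Qwen tokenizer is also handed to the text-generation pipeline in
# /extract_medical_data below. If qa_model_name is not a Qwen3-4B-compatible
# checkpoint, load the tokenizer from qa_model_name instead to avoid a
# vocabulary mismatch between tokenizer and model.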
819
+
820
+ # -----------------------------Split text into overlapping chunks---------------#
821
+ @log_execution_time()
822
+ def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
823
+ """
824
+ Splits text into overlapping token-based chunks without using NLTK.
825
+
826
+ Args:
827
+ text (str): Raw input text.
828
+ tokenizer (transformers tokenizer): Hugging Face tokenizer instance.
829
+ max_tokens (int): Max tokens per chunk.
830
+ overlap (int): Number of overlapping tokens between chunks.
831
+
832
+ Returns:
833
+ List[str]: List of decoded text chunks.
834
+ """
835
+ # Tokenize the full text
836
+ logger.info("Splitting text into chunks")
837
+ input_ids = tokenizer.encode(text, add_special_tokens=False)
838
+ chunks = []
839
+ start = 0
840
+ while start < len(input_ids):
841
+ end = start + max_tokens
842
+ chunk_ids = input_ids[start:end]
843
+ decoded_chunk = tokenizer.decode(chunk_ids, skip_special_tokens=True)
844
+ # Ensure a partial continuation isn't cut off mid-sentence. (The local is
+ # named decoded_chunk so it does not shadow this function's own name.)
845
+ if not decoded_chunk.endswith(('.', '?', '!', ':')):
846
+ decoded_chunk += "..."
847
+
848
+ chunks.append(decoded_chunk)
849
+ start += max_tokens - overlap
850
+ logger.info("Created %d chunks", len(chunks))
851
+ return chunks
852
+
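# Sanity check of the window arithmetic above: with max_tokens=512 and
# overlap=50, each window starts 512 - 50 = 462 tokens after the previous one,
# so a 1000-token document yields chunks covering tokens [0:512], [462:974]
# and [924:1000]. A sketch, assuming `report_text` holds the document:
#
#   chunks = chunk_text(report_text, tokenizer)
#   print(len(chunks),
#         [len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks])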
853
+ # ------------------ PARSE JSON OBJECTS FROM OUTPUT ------------------ #
854
+ @log_execution_time()
855
+ def extract_json_objects(text):
856
+ logger.info("Extracting JSON objects from text")
857
+ extracted = []
858
+ try:
859
+ json_start = text.index('[')
860
+ json_text = text[json_start:]
861
+ except ValueError:
862
+ logger.warning("⚠ '[' not found in output")
863
+ return []
864
+
865
+ # Try parsing full array first
866
+ try:
867
+ parsed = json.loads(json_text)
868
+ if isinstance(parsed, list):
869
+ return parsed
870
+ except Exception:
871
+ pass # fallback to manual parsing
872
+
873
+ # Manual recovery via brace matching
874
+ stack = 0
875
+ obj_start = None
876
+ for i, char in enumerate(json_text):
877
+ if char == '{':
878
+ if stack == 0:
879
+ obj_start = i
880
+ stack += 1
881
+ elif char == '}':
882
+ stack -= 1
883
+ if stack == 0 and obj_start is not None:
884
+ obj_str = json_text[obj_start:i+1]
885
+ try:
886
+ obj = json.loads(obj_str)
887
+ extracted.append(obj)
888
+ except Exception as e:
889
+ logger.error(f"❌ Invalid JSON object: {e}")
890
+ obj_start = None
891
+
892
+ return extracted
893
+
894
+
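# Recovery behaviour of extract_json_objects on a truncated model output: the
# full-array parse fails, but brace matching still salvages complete objects.
#
#   extract_json_objects('noise [ {"label": "bp", "answer": "120/80"}, {"label": "hr", "ans')
#   -> [{'label': 'bp', 'answer': '120/80'}]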
895
+ # ------------------ PROCESS A SINGLE CHUNK ------------------ #
896
+ @log_execution_time()
897
+ def process_chunk(generator, chunk, idx):
898
+ logger.info("Processing chunk %d", idx + 1)
899
+ prompt = f"""
900
+ [INST] <<SYS>>
901
+ You are a clinical data extraction assistant.
902
+
903
+ Your job is to:
904
+ 1. Read the following medical report.
905
+ 2. Extract all medically relevant facts as a list of JSON objects.
906
+ 3. Each object must include:
907
+ - "label": a short field name (e.g., "blood pressure", "diagnosis")
908
+ - "question": a question related to that field
909
+ - "answer": the answer from the text
910
+ 4. After extracting the list, categorize each object under one of the following fixed categories:
911
+
912
+ - Patient Info
913
+ - Vitals
914
+ - Symptoms
915
+ - Allergies
916
+ - Habits
917
+ - Comorbidities
918
+ - Diagnosis
919
+ - Medication
920
+ - Laboratory
921
+ - Radiology
922
+ - Doctor Note
923
+
924
+ Example format for structure only — do not include in output:
925
+ [
926
+ {{
927
+ "label": "patient name",
928
+ "question": "What is the patient's name?",
929
+ "answer": "John Doe",
930
+ "category": "Patient Info"
931
+ }},
932
+ {{
933
+ "label": "heart rate",
934
+ "question": "What is the heart rate?",
935
+ "answer": "78 bpm",
936
+ "category": "Vitals"
937
+ }}
938
+ ]
939
+
940
+ ⚠ Use the categories listed above. If an item does not fit any of these categories, create a new category for it.
941
+
942
+ Text:
943
+ {chunk}
944
+
945
+ Return a single valid JSON array of all extracted objects.
946
+ Do not include any explanations or commentary.
947
+ Only output the JSON array
948
+ <</SYS>> [/INST]
949
+ """
950
+
951
+ try:
952
+ output = generator(
953
+ prompt,
954
+ max_new_tokens=1024,
955
+ do_sample=True,
956
+ temperature=0.3
957
+ )[0]["generated_text"]
958
+ print("----------------------------------")
959
+ logger.info(f"📤 Output from chunk {idx}: {output}...")
960
+ return idx, output
961
+ except Exception as e:
962
+ logger.error("Error processing chunk %d: %s", idx, e)
963
+ return idx, None
964
+
965
+
966
+ # ------------------Extract Medical Data ------------------ #
967
+ @app.route("/extract_medical_data", methods=["POST"])
968
+ @log_execution_time()
969
+ def extract_medical_data():
970
+ data = request.json
971
+ logger.info("Received request: %s", json.dumps(data, indent=2))
972
+
973
+ qa_model_name = data.get("qa_model_name")
974
+ qa_model_type = data.get("qa_model_type")
975
+ extracted_files = data.get("extracted_data")
976
+
977
+ if not qa_model_name or not qa_model_type:
978
+ return jsonify({"error": "Missing 'qa_model_name' or 'qa_model_type'"}), 400
979
+
980
+ if not extracted_files:
981
+ return jsonify({"error": "Missing 'extracted_data' in request"}), 400
982
+
983
+ try:
984
+ logger.info(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
985
+ model = AutoModelForCausalLM.from_pretrained(qa_model_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
986
+ generator = pipeline(task=qa_model_type, model=model, tokenizer=tokenizer)
987
+ logger.info(f"✅ Model loaded successfully: {generator.model.config._name_or_path}")
988
+ except Exception as e:
989
+ logger.error("❌ Model load failure")
990
+ return jsonify({"error": f"Could not load model: {str(e)}"}), 500
991
+
992
+ structured_response = {"extracted_data": []}
993
+
994
+ for file_data in extracted_files:
995
+ filename = file_data.get("file", "unknown_file")
996
+ context = file_data.get("extracted_text", "").strip()
997
+ logger.info("Processing file: %s", filename)
998
+
999
+ if not context:
1000
+ logger.warning("No text found in file: %s", filename)
1001
+ structured_response["extracted_data"].append(
1002
+ {"file": filename, "medical_fields": "No data extracted"}
1003
+ )
1004
+ continue
1005
+
1006
+ chunks = chunk_text(context, tokenizer)
1007
+ logger.info(f"📚 Chunked into {len(chunks)} parts for {filename}")
1008
+
1009
+ all_extracted = []
1010
+ # for idx,chunk in enumerate(chunks):
1011
+ # print(f"Processing chunk {idx+1}/{len(chunks)}")
1012
+
1013
+ with ThreadPoolExecutor(max_workers=4) as executor:
1014
+ futures = {
1015
+ executor.submit(process_chunk, generator, chunk, idx): idx
1016
+ for idx, chunk in enumerate(chunks)
1017
+ }
1018
+ for future in as_completed(futures):
1019
+ idx = futures[future]
1020
+ _, output = future.result()
1021
+
1022
+ if not output:
1023
+ continue
1024
+
1025
+ try:
1026
+ objs = extract_json_objects(output)
1027
+ if objs:
1028
+ all_extracted.extend(objs)
1029
+ else:
1030
+ logger.error(f"⚠ Chunk {idx+1} yielded no valid JSON.")
1031
+ except Exception as e:
1032
+ logger.error(f"❌ Error extracting JSON from chunk {idx+1}")
1033
+
1034
+ # Clean and group results for this file
1035
+ if all_extracted:
1036
+ deduped = deduplicate_extractions(all_extracted)
1037
+ # cleaned_json = clean_result()
1038
+ grouped_data = group_by_category(deduped)
1039
+ else:
1040
+ grouped_data = {"error": "No valid data extracted"}
1041
+
1042
+ structured_response["extracted_data"].append(
1043
+ {"file": filename, "medical_fields": grouped_data}
1044
+ )
1045
+
1046
+ try:
1047
+ save_data_to_storage(filename, grouped_data)
1048
+ except Exception as e:
1049
+ logger.error(f"⚠ Failed to save data for {filename}: {e}")
1050
+
1051
+ logger.info("✅ Extraction complete.")
1052
+ return jsonify(structured_response)
1053
+
1054
+
1055
+
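# --- Standalone client sketch (separate script, not part of this module) ----
# A minimal request for the /extract_medical_data route above; the field names
# match the handler, the model name is illustrative. Assumes `requests`.
import requests

payload = {
    "qa_model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "qa_model_type": "text-generation",
    "extracted_data": [
        {"file": "report.pdf", "extracted_text": "BP 120/80 mmHg. Dx: hypertension."}
    ],
}
resp = requests.post("http://localhost:5000/extract_medical_data", json=payload)
print(resp.json())
# -----------------------------------------------------------------------------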
1056
+ # -------------------------- save data to a JSON file----------------------#
1057
+ @log_execution_time()
1058
+ def save_data_to_storage(filename, data):
1059
+ try:
1060
+ filename = filename.rsplit(".", 1)[0] # Remove extension
1061
+ filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
1062
+ logger.info(f"💾 Saving to: {filepath}")
1063
+ with open(filepath, "w") as file:
1064
+ json.dump(data, file)
1065
+ logger.info(f"✅ Data saved successfully to {filepath}")
1066
+ except Exception as e:
1067
+ logger.error(f"🚨 Exception during save: {e}")
1068
+
1069
+
1070
+ # Function to get data from a JSON file
1071
+ # 🔍 Get data from storage
1072
+ @log_execution_time()
1073
+ def get_data_from_storage(filename):
1074
+ try:
1075
+ filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
1076
+ logger.info(f"🔍 Looking for file at: {filepath}")
1077
+ if not os.path.exists(filepath):
1078
+ logger.warning(f"🚫 File not found at: {filepath}")
1079
+ return None
1080
+ with open(filepath, "r") as file:
1081
+ data = json.load(file)
1082
+ logger.info(f"✅ File found and loaded: {filepath}")
1083
+ return data
1084
+ except Exception as e:
1085
+ logger.error(f"🚨 Error loading data: {e}")
1086
+ return None
1087
+
1088
+
1089
+ # 🔹 Fetch updated medical data
1090
+ @app.route("/get_updated_medical_data", methods=["GET"])
1091
+ @log_execution_time()
1092
+ def get_updated_data():
1093
+ file_name = request.args.get("file")
1094
+
1095
+ if not file_name:
1096
+ return jsonify({"error": "File name is required"}), 400
1097
+
1098
+ # 🔥 Strip extension if present
1099
+ file_name = file_name.rsplit(".", 1)[0]
1100
+
1101
+ # ✅ Load updated JSON data from storage
1102
+ updated_data = get_data_from_storage(file_name)
1103
+
1104
+ if updated_data:
1105
+ return jsonify({"file": file_name, "data": updated_data}), 200
1106
+ else:
1107
+ return jsonify({"error": f"File '{file_name}' not found"}), 404
1108
+
1109
+
1110
+
1111
+ @app.route("/update_medical_data", methods=["PUT"])
1112
+ @log_execution_time()
1113
+ def update_medical_data():
1114
+ try:
1115
+ data = request.json
1116
+ logger.info("Received update: %s", json.dumps(data, indent=2))
1117
+
1118
+ filename = data.get("file", "").rsplit(".", 1)[0] # Strip extension like .pdf
1119
+ updates = data.get("updates", [])
1120
+
1121
+ if not filename or not updates:
1122
+ return jsonify({"error": "File name or updates missing"}), 400
1123
+
1124
+ # Load current stored data
1125
+ existing_data = get_data_from_storage(filename)
1126
+ if not existing_data:
1127
+ return jsonify({"error": f"File '{filename}' not found"}), 404
1128
+
1129
+ # Loop through updates and modify categorized_data
1130
+ for update in updates:
1131
+ category = update.get("category")
1132
+ field = update.get("field")
1133
+ new_value = update.get("value")
1134
+ updated = False
1135
+
1136
+ for extracted in existing_data.get("extracted_data", []):
1137
+ for cat in extracted.get("categorized_data", []):
1138
+ if cat.get("name") == category:
1139
+ for fld in cat.get("fields", []):
1140
+ if fld.get("label") == field:
1141
+ logger.info("Updating [%s] %s → %s", category, field, new_value)
1142
+ fld["value"] = new_value
1143
+ updated = True
1144
+ break
1145
+ if updated:
1146
+ break
1147
+ if updated:
1148
+ break
1149
+
1150
+ # 🧠 Sync medical_terms with categorized_data
1151
+ for extracted in existing_data.get("extracted_data", []):
1152
+ if "categorized_data" in extracted:
1153
+ new_terms = {}
1154
+ for category in extracted["categorized_data"]:
1155
+ for field in category.get("fields", []):
1156
+ label = field.get("label")
1157
+ value = field.get("value", "")
1158
+ new_terms[label] = value
1159
+ extracted["medical_terms"] = new_terms
1160
+ logger.info("Synced 'medical_terms' with 'categorized_data'")
1161
+
1162
+ # Save updated data to file
1163
+ save_data_to_storage(filename, existing_data)
1164
+ logger.info("✅ Updated data saved successfully")
1165
+
1166
+ return (
1167
+ jsonify(
1168
+ {"message": "Data updated successfully", "updated_data": existing_data}
1169
+ ),
1170
+ 200,
1171
+ )
1172
+
1173
+ except Exception as e:
1174
+ logger.error("Update error: %s", e)
1175
+ return jsonify({"error": str(e)}), 500
1176
+
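# Example payload for PUT /update_medical_data above; "category" and "field"
# must match the stored categorized_data names and field labels:
#
#   {"file": "report.pdf",
#    "updates": [{"category": "Vitals", "field": "Blood Pressure", "value": "118/76 mmHg"}]}
#
# Note: /extract_medical_data in this file saves group_by_category() output
# (a list of {"category": ..., "detail": [...]}), which has neither the
# "extracted_data" wrapper nor the "categorized_data" entries this handler
# walks, so updates will fail on that stored shape until the two are reconciled.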
1177
+ # Test Route
1178
+ @app.route("/")
1179
+ def home():
1180
+ return "Medical Data Extraction API is running!"
1181
+
1182
+
1183
+ if __name__ == "__main__":
1184
+ app.run(host="0.0.0.0", port=5000, debug=True)
1185
+ # if __name__ == '__main__':
1186
+ # from gevent.pywsgi import WSGIServer # type: ignore
1187
+ # http_server = WSGIServer(('0.0.0.0', 5000), app)
1188
+ # http_server.serve_forever()
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ flask
2
+ flask-cors
3
+ werkzeug
4
+ transformers
5
+ openai-whisper  # the code does "import whisper"; openai-whisper is the PyPI package that provides it
6
+ python-dotenv
7
+ torch==2.6.0
8
+ pillow
9
+ pdf2image
10
+ python-docx
11
+ openpyxl
12
+ pytesseract
13
+ scikit-learn
14
+ scipy
15
+ pandas
16
+ numpy
+ # Also imported by the extraction code above; without these the app fails at import time:
+ pdfplumber
+ PyPDF2
+ opencv-python-headless
+ huggingface_hub
+ accelerate  # required by transformers when loading models with device_map="auto"
speech_to_chart.py ADDED
@@ -0,0 +1,638 @@
1
+
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import logging
7
+ import shutil
8
+ from flask import Flask, request, jsonify, abort
9
+ from werkzeug.utils import secure_filename
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
11
+ import torch
12
+ import whisper
13
+ from dotenv import load_dotenv
14
+ import pytesseract
15
+ import cv2
16
+ import pdfplumber
17
+ import pandas as pd
18
+ from PIL import Image
19
+ from docx import Document
20
+ from flask_cors import CORS
21
+
22
+ # Load environment variables
23
+ load_dotenv()
24
+
25
+ # Initialize Flask app
26
+ app = Flask(__name__)
27
+ CORS(app)
28
+
29
+ # Configure logging
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
31
+
32
+ # Configure upload directory and max file size
33
+ UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
34
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
35
+ app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
36
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16 MB max file size
37
+
38
+ # Allowed file extensions
39
+ ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
40
+ ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
41
+
42
+ # Ensure ffmpeg is in PATH
43
+ ffmpeg_path = shutil.which("ffmpeg") or "C:\\ffmpeg\\bin\\ffmpeg.exe"
44
+ if not os.path.exists(ffmpeg_path):
45
+ raise RuntimeError("FFmpeg not found! Please install FFmpeg and set the correct path.")
46
+ os.environ["PATH"] += os.pathsep + os.path.dirname(ffmpeg_path)\
47
+
48
+ def allowed_file(filename, allowed_extensions):
49
+ """Check if the file extension is allowed."""
50
+ return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
51
+
52
+ class LazyModelLoader:
53
+ def __init__(self, model_name, task, tokenizer=None, apply_quantization=False):
54
+ self.model_name = model_name
55
+ self.task = task
56
+ self.tokenizer = tokenizer
57
+ self.apply_quantization = apply_quantization
58
+ self._pipeline = None
59
+
60
+ def load(self):
61
+ if self._pipeline is None:
62
+ logging.info(f"Loading pipeline for task: {self.task} | model: {self.model_name}")
63
+ if self.task == "question-answering":
64
+ model = AutoModelForCausalLM.from_pretrained(self.model_name)
65
+ tokenizer = AutoTokenizer.from_pretrained(self.model_name)
66
+ if self.apply_quantization:
67
+ logging.info("Applying quantization...")
68
+ model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
69
+ self._pipeline = pipeline(self.task, model=model, tokenizer=tokenizer)
70
+ else:
71
+ self._pipeline = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
72
+ return self._pipeline
73
+
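# Usage sketch for LazyModelLoader: nothing is downloaded until load() is first
# called, and the built pipeline is cached on the instance afterwards (the
# model name below is illustrative).
#
#   summarizer = LazyModelLoader("sshleifer/distilbart-cnn-12-6", "summarization")
#   summary = summarizer.load()("Some long clinical note ...", max_length=60)
#
# Note: the "question-answering" branch above builds an AutoModelForCausalLM,
# which is a text-generation architecture; a true extractive-QA pipeline would
# use AutoModelForQuestionAnswering instead.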
74
+ # PHI scrubbing agent
75
+ class PHIScrubberAgent:
76
+ @staticmethod
77
+ def scrub_phi(text):
78
+ try:
79
+ text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
80
+ text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
81
+ text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
82
+ text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]',
83
+ text, flags=re.IGNORECASE)
84
+ text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
85
+ text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
86
+ except Exception as e:
87
+ logging.error(f"PHI scrubbing failed: {e}")
88
+ return text
89
+
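# Worked example of the scrubbing order above: the phone rule runs first, and
# the "Dr."-specific rule fires before the generic two-capitalised-words rule,
# so titles survive while names are masked:
#
#   PHIScrubberAgent.scrub_phi("Dr. Jane Smith called John Doe at 555-123-4567")
#   -> 'Dr. [NAME] called [NAME] at [PHONE]'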
90
+ # Summarization Agent
91
+ class SummarizerAgent:
92
+ def __init__(self, summarization_model_loader):
93
+ self.summarization_model_loader = summarization_model_loader
94
+
95
+ def generate_summary(self, text):
96
+ model = self.summarization_model_loader.load()
97
+ try:
98
+ summary_result = model(text, max_length=150, min_length=30, do_sample=False)
99
+ return summary_result[0]['summary_text'].strip()
100
+ except Exception as e:
101
+ logging.error(f"Summary generation failed: {e}")
102
+ return "Summary generation failed."
103
+
104
+ # Medical Data Extraction Agent
105
+ class MedicalDataExtractorAgent:
106
+ def __init__(self, gen_model_loader):
107
+ self.gen_model_loader = gen_model_loader
108
+
109
+ def extract_medical_data(self, text):
110
+ try:
111
+ generator = self.gen_model_loader.load()
112
+ prompt = (
113
+ "Extract structured medical information from the following clinical note.\n\n"
114
+ "Return the result in JSON format with the following fields:\n"
115
+ "patient_condition, symptoms, current_problems, allergies, dr_notes, "
116
+ "prescription, investigations, follow_up_instructions.\n\n"
117
+ f"Clinical Note:\n{text}\n\n"
118
+ "Structured JSON Output:\n"
119
+ )
120
+ response = generator(prompt, max_new_tokens=256)[0]["generated_text"]
121
+ logging.debug(f"Raw model output: {response}")
122
+
123
+ json_start = response.find("{")
124
+ json_end = response.rfind("}") + 1
125
+ if json_start == -1 or json_end == -1:
126
+ raise ValueError("No JSON found in the model response.")
127
+
128
+ json_str = response[json_start:json_end]
129
+ return json.loads(json_str)
130
+
131
+ except Exception as e:
132
+ logging.error(f"Error extracting medical data: {e}")
133
+ return {"error": f"Failed to extract medical data: {str(e)}"}
134
+
135
+ # Initialize lazy loaders
136
+ gen_model_loader = LazyModelLoader(
137
+ "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
138
+ "text-generation",
139
+ )
140
+ summarization_model_loader = LazyModelLoader("google-t5/t5-large", "summarization", apply_quantization=True)
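# Note: LazyModelLoader only applies dynamic quantization inside its
# "question-answering" branch, so apply_quantization=True is currently a
# no-op for this summarization loader.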
141
+ whisper_model = whisper.load_model("base")
142
+
143
+ # Initialize agents
144
+ phi_scrubber_agent = PHIScrubberAgent()
145
+ medical_data_extractor_agent = MedicalDataExtractorAgent(gen_model_loader)
146
+ summarizer_agent = SummarizerAgent(summarization_model_loader)
147
+
148
+ # API Endpoints
149
+ @app.route('/api/extract_medical_data', methods=['POST'])
150
+ def extract_medical_data():
151
+ try:
152
+ data = request.json
153
+ if "text" not in data or not data["text"].strip():
154
+ return jsonify({"error": "No valid text provided"}), 400
155
+ raw_text = data["text"]
156
+ clean_text = phi_scrubber_agent.scrub_phi(raw_text)
157
+ structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
158
+ return jsonify(structured_data), 200
159
+ except Exception as e:
160
+ logging.error(f"Failed to extract medical data: {e}")
161
+ return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
162
+
163
+ @app.route('/api/transcribe', methods=['POST'])
164
+ def transcribe_audio():
165
+ if 'audio' not in request.files:
166
+ abort(400, description="No audio file provided")
167
+ audio_file = request.files['audio']
168
+ if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
169
+ abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
170
+ filename = secure_filename(audio_file.filename)
171
+ audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
172
+ audio_file.save(audio_path)
173
+ try:
174
+ result = whisper_model.transcribe(audio_path)
175
+ transcribed_text = result["text"]
176
+ os.remove(audio_path)
177
+ return jsonify({"transcribed_text": transcribed_text}), 200
178
+ except Exception as e:
179
+ logging.error(f"Transcription failed: {str(e)}")
180
+ return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
181
+
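# A minimal client sketch for /api/transcribe (assumes the `requests` package;
# the form field name "audio" and the mp3/wav/flac whitelist come from the
# handler above):
#
#   import requests
#   with open("visit.wav", "rb") as f:  # hypothetical recording
#       r = requests.post("http://localhost:5000/api/transcribe", files={"audio": f})
#   print(r.json()["transcribed_text"])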
182
+ @app.route('/api/generate_summary', methods=['POST'])
183
+ def generate_summary():
184
+ data = request.json
185
+ if "text" not in data or not data["text"].strip():
186
+ return jsonify({"error": "No valid text provided"}), 400
187
+ context = data["text"]
188
+ clean_text = phi_scrubber_agent.scrub_phi(context)
189
+ summary = summarizer_agent.generate_summary(clean_text)
190
+ return jsonify({"summary": summary}), 200
191
+
192
+ @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
193
+ def extract_medical_data_from_audio():
194
+ if 'audio' not in request.files:
195
+ abort(400, description="No audio file provided")
196
+ audio_file = request.files['audio']
197
+ if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
198
+ abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
199
+ filename = secure_filename(audio_file.filename)
200
+ audio_path = os.path.join(UPLOAD_DIR, filename)
201
+ audio_file.save(audio_path)
202
+ try:
203
+ result = whisper_model.transcribe(audio_path)
204
+ transcribed_text = result["text"]
205
+ clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
206
+ summary = summarizer_agent.generate_summary(clean_text)
207
+ structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
208
+ response = {
209
+ "transcribed_text": clean_text,
210
+ "summary": summary,
211
+ "medical_chart": structured_data
212
+ }
213
+ os.remove(audio_path)
214
+ return jsonify(response), 200
215
+ except Exception as e:
216
+ logging.error(f"Processing failed: {str(e)}")
217
+ return jsonify({"error": f"Processing failed: {str(e)}"}), 500
218
+
219
+ if __name__ == '__main__':
220
+ app.run(host='0.0.0.0', port=5000, debug=False)
221
+
+ # import json
+ # import os
+ # import re
+ # import logging
+ # import shutil
+ # from dotenv import load_dotenv
+ # from flask import Flask, request, jsonify, abort
+ # from werkzeug.utils import secure_filename
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ # import pytesseract
+ # import cv2
+ # import pdfplumber
+ # import pandas as pd
+ # from PIL import Image
+ # from docx import Document
+ # from flask_cors import CORS
+ # from flask_executor import Executor
+ # from sentence_transformers import SentenceTransformer
+ # import faiss
+ # import whisper
+ # from PyPDF2 import PdfReader
+ # from pdf2image import convert_from_path
+ # from concurrent.futures import ThreadPoolExecutor
+ # import tempfile
+
+ # # Load environment variables
+ # load_dotenv()
+
+ # # Initialize Flask app
+ # app = Flask(__name__)
+ # CORS(app)
+
+ # # Configure logging
+ # logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ # # Configure upload directory and max file size
+ # UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
+ # os.makedirs(UPLOAD_DIR, exist_ok=True)
+ # app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
+ # app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB max file size
+
+ # # Initialize Flask-Executor for asynchronous tasks
+ # executor = Executor(app)
+ # whisper_model = whisper.load_model("tiny")
+ # # Allowed file extensions
+ # ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
+ # ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
+
+ # # Ensure ffmpeg is in PATH
+ # ffmpeg_path = shutil.which("ffmpeg") or "C:\\ffmpeg\\bin\\ffmpeg.exe"
+ # if not os.path.exists(ffmpeg_path):
+ #     raise RuntimeError("FFmpeg not found! Please install FFmpeg and set the correct path.")
+ # os.environ["PATH"] += os.pathsep + os.path.dirname(ffmpeg_path)
+
+ # # Lazy model loading to save resources
+ # class LazyModelLoader:
+ #     def __init__(self, model_name, task, tokenizer=None):
+ #         self.model_name = model_name
+ #         self.task = task
+ #         self.tokenizer = tokenizer
+ #         self._model = None
+
+ #     def load(self):
+ #         """Load the model if not already loaded."""
+ #         if self._model is None:
+ #             logging.info(f"Loading model: {self.model_name}")
+ #             if self.task == "text-generation":
+ #                 self._model = AutoModelForCausalLM.from_pretrained(
+ #                     self.model_name, device_map="auto", torch_dtype="auto"
+ #                 )
+ #                 self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, legacy=False)
+ #                 # Set pad_token_id if it's not already set
+ #                 if self._model.generation_config.pad_token_id is None or self._model.generation_config.pad_token_id < 0:
+ #                     if self._tokenizer.eos_token_id is not None:
+ #                         self._model.generation_config.pad_token_id = self._tokenizer.eos_token_id
+ #                         logging.info(f"Set pad_token_id to {self._tokenizer.eos_token_id}")
+ #                     else:
+ #                         logging.warning("No valid eos_token_id found. Setting pad_token_id to 0 as a fallback.")
+ #                         self._model.generation_config.pad_token_id = 0
+ #             else:
+ #                 self._model = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
+ #         return self._model
+
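+ # # Example (illustrative sketch): lazy loading defers the expensive download to the
+ # # first request that needs the model, and reuses the cached instance afterwards:
+ # #
+ # #   summarizer_loader = LazyModelLoader("google-t5/t5-small", "summarization")
+ # #   model = summarizer_loader.load()   # downloads/loads on first call
+ # #   model = summarizer_loader.load()   # returns the cached pipeline
+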
+ # # Text extraction agents
+ # class TextExtractorAgent:
+ #     @staticmethod
+ #     def extract_text(filepath, ext):
+ #         """Extract text based on file type."""
+ #         try:
+ #             if ext == "pdf":
+ #                 return TextExtractorAgent.extract_text_from_pdf(filepath)
+ #             elif ext in {"jpg", "jpeg", "png"}:
+ #                 return TextExtractorAgent.extract_text_from_image(filepath)
+ #             elif ext == "docx":
+ #                 return TextExtractorAgent.extract_text_from_docx(filepath)
+ #             elif ext in {"xlsx", "xls"}:
+ #                 return TextExtractorAgent.extract_text_from_excel(filepath)
+ #             return None
+ #         except Exception as e:
+ #             logging.error(f"Text extraction failed: {e}")
+ #             return None
+
+ #     @staticmethod
+ #     def extract_text_from_pdf(filepath):
+ #         """Extract text from a PDF file."""
+ #         text = ""
+ #         with pdfplumber.open(filepath) as pdf:
+ #             for page in pdf.pages:
+ #                 page_text = page.extract_text()
+ #                 if page_text:
+ #                     text += page_text + "\n"
+ #         return text.strip() or None
+
+ #     @staticmethod
+ #     def extract_text_from_image(filepath):
+ #         """Extract text from an image using OCR."""
+ #         image = cv2.imread(filepath)
+ #         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+ #         _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+ #         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+ #             processed_path = temp_file.name
+ #             cv2.imwrite(processed_path, processed)
+ #         text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
+ #         os.remove(processed_path)
+ #         return text.strip() or None
+
+ #     @staticmethod
+ #     def extract_text_from_docx(filepath):
+ #         """Extract text from a DOCX file."""
+ #         doc = Document(filepath)
+ #         text = "\n".join([para.text for para in doc.paragraphs])
+ #         return text.strip() or None
+
+ #     @staticmethod
+ #     def extract_text_from_excel(filepath):
+ #         """Extract text from an Excel file."""
+ #         dfs = pd.read_excel(filepath, sheet_name=None)
+ #         text = "\n".join([
+ #             "\n".join([
+ #                 " ".join(map(str, df[col].dropna()))
+ #                 for col in df.columns
+ #             ])
+ #             for df in dfs.values()
+ #         ])
+ #         return text.strip() or None
+
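+ # # Example (illustrative sketch; file names are assumptions): dispatch is driven by
+ # # the extension passed alongside the path:
+ # #
+ # #   text = TextExtractorAgent.extract_text("report.pdf", "pdf")     # pdfplumber
+ # #   text = TextExtractorAgent.extract_text("scan.png", "png")      # OCR via Tesseract
+ # #   text = TextExtractorAgent.extract_text("chart.docx", "docx")   # python-docx
+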
+ # # PHI scrubbing agent
+ # class PHIScrubberAgent:
+ #     @staticmethod
+ #     def scrub_phi(text):
+ #         """Remove sensitive personal health information (PHI)."""
+ #         try:
+ #             text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
+ #             text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
+ #             text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
+ #             text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
+ #             text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
+ #             text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
+ #         except Exception as e:
+ #             logging.error(f"PHI scrubbing failed: {e}")
+ #         return text
+
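+ # # Example (illustrative sketch showing what the regexes above would replace):
+ # #
+ # #   scrubbed = PHIScrubberAgent.scrub_phi(
+ # #       "John Smith, SSN 123-45-6789, lives at 42 Elm Street, call 555-123-4567."
+ # #   )
+ # #   # -> "[NAME], SSN [SSN], lives at [ADDRESS], call [PHONE]."
+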
+ # # Summarization agent
+ # class SummarizerAgent:
+ #     def __init__(self, summarization_model_loader):
+ #         self.summarization_model_loader = summarization_model_loader
+
+ #     def generate_summary(self, text):
+ #         """Generate a summary of the provided text."""
+ #         model = self.summarization_model_loader.load()
+ #         try:
+ #             summary_result = model(text, do_sample=False)
+ #             return summary_result[0]['summary_text'].strip()
+ #         except Exception as e:
+ #             logging.error(f"Summary generation failed: {e}")
+ #             return "Summary generation failed."
+
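+ # # Example (illustrative sketch): the loader yields a transformers summarization
+ # # pipeline, which returns a list of dicts keyed by 'summary_text':
+ # #
+ # #   agent = SummarizerAgent(LazyModelLoader("google-t5/t5-small", "summarization"))
+ # #   print(agent.generate_summary("Patient presents with elevated blood pressure ..."))
+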
+ # def allowed_file(filename, allowed_extensions):
+ #     """Check if the file extension is allowed."""
+ #     return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
+
+ # # Knowledge Base
+ # class KnowledgeBase:
+ #     def __init__(self, documents):
+ #         self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+ #         self.documents = documents
+ #         self.embeddings = self.embedding_model.encode(documents)
+ #         self.dimension = self.embedding_model.get_sentence_embedding_dimension()
+ #         self.index = faiss.IndexFlatL2(self.dimension)
+ #         self.index.add(self.embeddings)
+
+ #     def retrieve_relevant_info(self, query, top_k=3):
+ #         """Retrieve relevant medical information from the knowledge base."""
+ #         query_embedding = self.embedding_model.encode([query])
+ #         distances, indices = self.index.search(query_embedding, top_k)
+ #         relevant_texts = [self.documents[i] for i in indices[0]]
+ #         return relevant_texts
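+ # # Example (illustrative sketch): queries are embedded with the same model and
+ # # matched against the FAISS L2 index built in __init__:
+ # #
+ # #   kb = KnowledgeBase(["Hypertension is a chronic condition ...",
+ # #                       "Diabetes is a metabolic disorder ..."])
+ # #   print(kb.retrieve_relevant_info("high blood pressure", top_k=1))
+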
+ # # Medical data extraction agent
+ # class MedicalDataExtractorAgent:
+ #     def __init__(self, model_loader, knowledge_base):
+ #         self.model_loader = model_loader
+ #         self.knowledge_base = knowledge_base
+
+ #     def retrieve_relevant_info(self, query, top_k=3):
+ #         """Retrieve relevant medical information from the knowledge base."""
+ #         query_embedding = self.knowledge_base.embedding_model.encode([query])
+ #         distances, indices = self.knowledge_base.index.search(query_embedding, top_k)
+ #         relevant_texts = [self.knowledge_base.documents[i] for i in indices[0]]
+ #         return relevant_texts
+
+ #     def extract_medical_data(self, text):
+ #         """Extract structured medical data from text using Agentic RAG."""
+ #         try:
+ #             # Define the default JSON schema
+ #             default_schema = {
+ #                 "patient_name": "[NAME]",
+ #                 "age": None,
+ #                 "gender": None,
+ #                 "diagnosis": [],
+ #                 "symptoms": [],
+ #                 "medications": [],
+ #                 "allergies": [],
+ #                 "vitals": {
+ #                     "blood_pressure": None,
+ #                     "heart_rate": None,
+ #                     "temperature": None
+ #                 },
+ #                 "notes": ""
+ #             }
+ #             # Construct the prompt with the input text
+ #             prompt = f"""
+ #             ### Instruction:
+ #             Extract structured medical data from the following text as JSON, with all keys enclosed in double quotes and no backslash escapes.
+ #             The JSON should include patient_name, age, gender, medications, allergies, diagnosis, symptoms, vitals, and notes.
+ #             ### Text:
+ #             {text}
+ #             ### Response:
+ #             """
+ #             # Tokenize and generate the response
+ #             model = self.model_loader.load()
+ #             tokenizer = self.model_loader._tokenizer
+ #             inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+ #             outputs = model.generate(
+ #                 inputs.input_ids,
+ #                 num_return_sequences=1,
+ #                 temperature=0.7,
+ #                 top_p=0.9,
+ #                 do_sample=True
+ #             )
+ #             response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #             logging.info(f"Model response: {response}")
+ #             # Parse and normalize the JSON output
+ #             json_start = response.find("{")
+ #             json_end = response.rfind("}") + 1
+ #             # rfind returns -1 when no "}" is present, so json_end is 0 on failure
+ #             if json_start == -1 or json_end == 0:
+ #                 raise ValueError("No JSON found in the model response.")
+ #             # Extract the JSON substring
+ #             structured_data = json.loads(response[json_start:json_end])
+ #             # Normalize the JSON output
+ #             normalized_data = self.normalize_json_output(structured_data, default_schema)
+ #             # Ensure blood pressure is a string
+ #             if normalized_data["vitals"]["blood_pressure"] and isinstance(normalized_data["vitals"]["blood_pressure"], str):
+ #                 normalized_data["vitals"]["blood_pressure"] = normalized_data["vitals"]["blood_pressure"].strip('"')
+ #             return json.dumps(normalized_data)
+ #         except json.JSONDecodeError as e:
+ #             logging.error(f"JSON parsing failed: {e}")
+ #             return json.dumps({"error": f"Failed to parse JSON: {str(e)}"})
+ #         except Exception as e:
+ #             logging.error(f"Error extracting medical data: {e}")
+ #             return json.dumps({"error": f"Failed to extract medical data: {str(e)}"})
+
+ #     @staticmethod
+ #     def normalize_json_output(model_output, default_schema):
+ #         """
+ #         Normalize the model's JSON output to match the default schema.
+ #         """
+ #         try:
+ #             normalized_output = default_schema.copy()
+ #             for key in normalized_output:
+ #                 if key in model_output:
+ #                     normalized_output[key] = model_output[key]
+ #             return normalized_output
+ #         except Exception as e:
+ #             logging.error(f"Failed to normalize JSON: {e}")
+ #             return default_schema  # Return the default schema in case of errors
+
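+ # # Example (illustrative sketch): keys missing from the model output keep their
+ # # schema defaults, and keys outside the schema are dropped:
+ # #
+ # #   MedicalDataExtractorAgent.normalize_json_output({"age": 54, "extra": "ignored"}, default_schema)
+ # #   # -> {"patient_name": "[NAME]", "age": 54, "gender": None, ..., "notes": ""}
+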
+ # # Initialize lazy loaders
+ # medalpaca_model_loader = LazyModelLoader("lmsys/vicuna-7b-v1.5", "text-generation")
+ # summarization_model_loader = LazyModelLoader("google-t5/t5-small", "summarization")
+ # whisper_model = whisper.load_model("tiny")
+
+ # # Initialize knowledge base
+ # medical_documents = [
+ #     "Hypertension is a chronic condition characterized by elevated blood pressure.",
+ #     "Diabetes is a metabolic disorder that affects blood sugar levels.",
+ #     "Common symptoms of chest pain include pressure, tightness, or discomfort in the chest."
+ # ]
+ # knowledge_base = KnowledgeBase(medical_documents)
+
+ # # Initialize agents
+ # text_extractor_agent = TextExtractorAgent()
+ # phi_scrubber_agent = PHIScrubberAgent()
+ # medical_data_extractor_agent = MedicalDataExtractorAgent(medalpaca_model_loader, knowledge_base)
+ # summarizer_agent = SummarizerAgent(summarization_model_loader)
+
+ # # API Endpoints
+ # @app.route('/api/extract_medical_data', methods=['POST'])
+ # def extract_medical_data():
+ #     """Extract structured medical data from raw text."""
+ #     try:
+ #         data = request.json
+ #         if "text" not in data or not data["text"].strip():
+ #             return jsonify({"error": "No valid text provided"}), 400
+ #         raw_text = data["text"]
+ #         clean_text = phi_scrubber_agent.scrub_phi(raw_text)
+ #         structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
+ #         return jsonify(json.loads(structured_data)), 200
+ #     except Exception as e:
+ #         logging.error(f"Failed to extract medical data: {e}")
+ #         return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
+
+ # @app.route('/api/transcribe', methods=['POST'])
+ # def transcribe_audio():
+ #     """Transcribe audio files into text."""
+ #     if 'audio' not in request.files:
+ #         abort(400, description="No audio file provided")
+ #     audio_file = request.files['audio']
+ #     if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
+ #         abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
+ #     filename = secure_filename(audio_file.filename)
+ #     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+ #     audio_file.save(audio_path)
+ #     try:
+ #         result = whisper_model.transcribe(audio_path)
+ #         transcribed_text = result["text"]
+ #         os.remove(audio_path)
+ #         return jsonify({"transcribed_text": transcribed_text}), 200
+ #     except Exception as e:
+ #         logging.error(f"Transcription failed: {str(e)}")
+ #         return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
+
+ # @app.route('/api/generate_summary', methods=['POST'])
+ # def generate_summary():
+ #     """Generate a summary from the provided text."""
+ #     data = request.json
+ #     if "text" not in data or not data["text"].strip():
+ #         return jsonify({"error": "No valid text provided"}), 400
+ #     context = data["text"]
+ #     clean_text = phi_scrubber_agent.scrub_phi(context)
+ #     summary = summarizer_agent.generate_summary(clean_text)
+ #     return jsonify({"summary": summary}), 200
+
+ # @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
+ # def extract_medical_data_from_audio():
+ #     """Extract medical data from transcribed audio."""
+ #     if 'audio' not in request.files:
+ #         abort(400, description="No audio file provided")
+ #     audio_file = request.files['audio']
+ #     if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
+ #         abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
+ #     filename = secure_filename(audio_file.filename)
+ #     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+ #     audio_file.save(audio_path)
+ #     try:
+ #         result = whisper_model.transcribe(audio_path)
+ #         transcribed_text = result["text"]
+ #         clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
+ #         # Use the PHI-scrubbed text downstream so no raw identifiers leak into outputs
+ #         summary = summarizer_agent.generate_summary(clean_text)
+ #         structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
+ #         response = {
+ #             "transcribed_text": clean_text,
+ #             "summary": summary,
+ #             "medical_chart": json.loads(structured_data)
+ #         }
+ #         os.remove(audio_path)
+ #         return jsonify(response), 200
+ #     except Exception as e:
+ #         logging.error(f"Processing failed: {str(e)}")
+ #         return jsonify({"error": f"Processing failed: {str(e)}"}), 500
+
+ # @app.route('/upload_document', methods=['POST'])
+ # def upload_document():
+ #     """Upload and extract text from documents."""
+ #     if 'file' not in request.files:
+ #         return jsonify({"error": "No file uploaded"}), 400
+ #     file = request.files['file']
+ #     if file.filename == '':
+ #         return jsonify({"error": "No file selected"}), 400
+ #     if file and allowed_file(file.filename, ALLOWED_DOCUMENT_EXTENSIONS):
+ #         filename = secure_filename(file.filename)
+ #         filepath = os.path.join(UPLOAD_DIR, filename)
+ #         file.save(filepath)
+ #         ext = filename.rsplit('.', 1)[1].lower()
+ #         extracted_text = text_extractor_agent.extract_text(filepath, ext)
+ #         if not extracted_text:
+ #             return jsonify({"error": "No text found in file."}), 400
+ #         response_data = {
+ #             "file": filename,
+ #             "extracted_text": extracted_text[:500],
+ #             "message": "Click to extract medical terms"
+ #         }
+ #         os.remove(filepath)
+ #         return jsonify(response_data), 200
+ #     return jsonify({"error": "Invalid file type"}), 400
+
+ # @app.route('/extract_medical_data_from_document', methods=['POST'])
+ # def extract_medical_data_from_document():
+ #     """Extract medical data from document text."""
+ #     data = request.json
+ #     if "text" not in data or not data["text"].strip():
+ #         return jsonify({"error": "No valid text provided"}), 400
+ #     context = data["text"]
+ #     clean_text = phi_scrubber_agent.scrub_phi(context)
+ #     structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
+ #     return jsonify(json.loads(structured_data)), 200
+
+ # if __name__ == '__main__':
+ #     app.run(host='0.0.0.0', port=5000, debug=True)