dev-3 committed
Commit f93d309 · 1 Parent(s): b9c4cf8
.dockerignore CHANGED
@@ -1,3 +1,50 @@
  # This file tells Hugging Face Spaces to use Docker
  # and exposes the correct port for Flask/Gradio/FastAPI
  # No further config needed if Dockerfile is present
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.so
+ *.egg-info/
+ *.egg
+
+ # Exclude datasets, model weights, and large files
+ *.pt
+ *.pth
+ *.ckpt
+ *.h5
+ *.onnx
+ *.npz
+ *.npy
+ *.tar.gz
+ *.zip
+ *.tar
+ *.gz
+ *.bz2
+ *.7z
+ *.rar
+
+ # Exclude logs and outputs
+ *.log
+ *.out
+ *.tmp
+ *.swp
+
+ # Exclude Jupyter notebooks (if not needed)
+ *.ipynb
+
+ # Exclude local environment files
+ .env
+ .venv/
+ venv/
+
+ # Exclude OS files
+ .DS_Store
+ Thumbs.db
+
+ # Exclude other unnecessary folders/files
+ node_modules/
+ datasets/
+ models/
+ outputs/
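
Everything matched by the new patterns above is excluded from the Docker build context, so datasets and model weights are never baked into the image; at runtime the app pulls whatever it needs instead. As a small sketch of that behavior (using google-t5/t5-small, one of the models this repo already references), transformers downloads and caches weights on first use:

from transformers import pipeline

# Weights excluded from the image are fetched and cached at first use.
summarizer = pipeline("summarization", model="google-t5/t5-small")
print(summarizer("Blood pressure was elevated at the last two visits.",
                 max_length=20, min_length=5, do_sample=False)[0]["summary_text"])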
DEPLOYMENT.md CHANGED
@@ -1,13 +1,13 @@
  # Hugging Face Spaces Docker deployment instructions
 
- # 1. Make sure your Dockerfile exposes port 5000 and runs your app on 0.0.0.0:5000
- # 2. Your Flask app should listen on host='0.0.0.0' and port=5000
+ # 1. Make sure your Dockerfile exposes port 7860 and runs your app on 0.0.0.0:7860
+ # 2. Your Flask app should listen on host='0.0.0.0' and port=7860
  # 3. requirements.txt should include all dependencies
  # 4. .huggingface.yaml with 'runtime: docker' is present
  # 5. .dockerignore and .gitignore are present
 
  # To test locally:
  # docker build -t hntai-app .
- # docker run -p 5000:5000 hntai-app
+ # docker run -p 7860:7860 hntai-app
 
- # Your app will be available at http://localhost:5000
+ # Your app will be available at http://localhost:7860
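
For reference, a minimal standalone Flask app satisfying steps 1 and 2 could look like the sketch below (a hypothetical example, not the project's actual ai_med_extract.app module); 7860 is the port Hugging Face Docker Spaces expect by default:

from flask import Flask

app = Flask(__name__)

@app.route("/")
def home():
    return "OK"

if __name__ == "__main__":
    # Bind to all interfaces on 7860 so the Spaces proxy can reach the app.
    app.run(host="0.0.0.0", port=7860)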
Dockerfile CHANGED
@@ -1,7 +1,6 @@
- # Use a lightweight Python base image
  FROM python:3.10-slim
 
- # Install system dependencies
+ # Install system dependencies and build tools
  RUN apt-get update && apt-get install -y \
      build-essential \
      pkg-config \
@@ -17,25 +16,28 @@ RUN apt-get update && apt-get install -y \
      libgl1 \
      && rm -rf /var/lib/apt/lists/*
 
-
- # Set the working directory
  WORKDIR /app
 
  # Create uploads directory and set permissions
  RUN mkdir -p /app/uploads && chmod 777 /app/uploads
 
  # Copy only dependency files first for better caching
- COPY requirements.txt .
+ COPY requirements.txt .
 
  # Install pip and dependencies
  RUN pip install --upgrade pip \
-     && pip install -r requirements.txt --no-cache-dir --retries 10 --timeout 120
-
- # Copy rest of your code (this is after deps so doesn't bust cache)
+     && pip install -r requirements.txt --no-cache-dir \
+     # Remove build tools and clean up to reduce image size
+     && apt-get remove -y build-essential pkg-config libsystemd-dev libcairo2-dev \
+     && apt-get autoremove -y \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy the rest of your code
  COPY . .
 
- # Expose port 5000 (required by HF Spaces)
- EXPOSE 5000
+ # Expose port 7860 (required by HF Spaces)
+ EXPOSE 7860
 
  # Run the Flask app
- CMD ["gunicorn", "-b", "0.0.0.0:5000", "ai_med_extract.app:app"]
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "ai_med_extract.app:app"]
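
The cleanup only shrinks the image because the apt-get remove happens in the same RUN instruction that installed the build tools; a separate later RUN would merely stack a new layer on top of the one that still contains them. To smoke-test the rebuilt container, a quick sketch (assuming the image is running as described in DEPLOYMENT.md and the app serves a root route):

import urllib.request

# Assumes: docker build -t hntai-app .  followed by  docker run -p 7860:7860 hntai-app
with urllib.request.urlopen("http://localhost:7860/", timeout=10) as resp:
    print(resp.status, resp.read(80))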
ai_med_extract/__main__.py CHANGED
@@ -2,4 +2,4 @@ from .app import app
 
  # Entrypoint for running the app as a module
  if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=5000, debug=True)
+     app.run(host="0.0.0.0", port=7860, debug=True)
ai_med_extract/app.py CHANGED
@@ -56,4 +56,4 @@ from .api.routes import register_routes
  register_routes(app, agents)
 
  if __name__ == "__main__":
-     app.run(host="0.0.0.0", port=5000, debug=True)
+     app.run(host="0.0.0.0", port=7860, debug=True)
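
The port now lives in four places (both entrypoints, EXPOSE, and the gunicorn CMD), which is exactly how the 5000/7860 drift happened. A hypothetical refactor of ai_med_extract/__main__.py (not part of this commit) could read it from an environment variable instead:

import os

from .app import app

# Hypothetical PORT override; the HF Spaces Docker runtime defaults to 7860.
PORT = int(os.getenv("PORT", "7860"))

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=PORT, debug=True)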
combined1.py DELETED
@@ -1,880 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- import logging
5
- from dotenv import load_dotenv
6
- from flask import Flask, request, jsonify, abort
7
- from werkzeug.utils import secure_filename
8
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
9
- import pytesseract
10
- import cv2
11
- import pdfplumber
12
- import pandas as pd
13
- from PIL import Image
14
- from docx import Document
15
- from flask_cors import CORS
16
- from flask_executor import Executor
17
- from sentence_transformers import SentenceTransformer
18
- import faiss
19
- import whisper
20
- from PyPDF2 import PdfReader
21
- from pdf2image import convert_from_path
22
- from concurrent.futures import ThreadPoolExecutor
23
- import tempfile
24
- import tensorflow.keras.layers as KL # Instead of keras.layers as KL
25
- import numpy as np
26
-
27
- # Load environment variables
28
- load_dotenv()
29
-
30
- # Set Tesseract OCR Path
31
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
32
-
33
- # Initialize Flask app
34
- app = Flask(__name__)
35
- CORS(app)
36
-
37
- # Configure logging
38
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
39
-
40
- # Configure upload directory and max file size
41
- UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
42
- os.makedirs(UPLOAD_DIR, exist_ok=True)
43
- app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
44
- app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16 MB max file size
45
-
46
- # Initialize Flask-Executor for asynchronous tasks
47
- executor = Executor(app)
48
- whisper_model = whisper.load_model("tiny")
49
-
50
- # Allowed file extensions
51
- ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
52
- ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
53
-
54
- UPLOAD_FOLDER = 'Uploads'
55
- ALLOWED_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'svg', 'docx', 'doc'}
56
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
57
-
58
- # Set file size limits
59
- MAX_SIZE_PDF_DOCS = 1 * 1024 * 1024 * 1024 # 1GB
60
- MAX_SIZE_IMAGES = 500 * 1024 * 1024 # 500MB
61
-
62
- # Lazy model loading to save resources
63
- class LazyModelLoader:
64
- def __init__(self, model_name, task, tokenizer=None):
65
- self.model_name = model_name
66
- self.task = task
67
- self.tokenizer = tokenizer
68
- self._model = None
69
-
70
- def load(self):
71
- """Load the model if not already loaded."""
72
- if self._model is None:
73
- logging.info(f"Loading model: {self.model_name}")
74
- if self.task == "text-generation":
75
- self._model = AutoModelForCausalLM.from_pretrained(
76
- self.model_name, device_map="auto", torch_dtype="auto"
77
- )
78
- self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, legacy=False)
79
- if self._model.generation_config.pad_token_id is None or self._model.generation_config.pad_token_id < 0:
80
- if self._tokenizer.eos_token_id is not None:
81
- self._model.generation_config.pad_token_id = self._tokenizer.eos_token_id
82
- logging.info(f"Set pad_token_id to {self._tokenizer.eos_token_id}")
83
- else:
84
- logging.warning("No valid eos_token_id found. Setting pad_token_id to 0 as a fallback.")
85
- self._model.generation_config.pad_token_id = 0
86
- else:
87
- self._model = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
88
- return self._model
89
-
90
- # Text extraction agents
91
- class TextExtractorAgent:
92
- @staticmethod
93
- def extract_text(filepath, ext):
94
- """Extract text based on file type."""
95
- try:
96
- if ext == "pdf":
97
- return TextExtractorAgent.extract_text_from_pdf(filepath)
98
- elif ext in {"jpg", "jpeg", "png"}:
99
- return TextExtractorAgent.extract_text_from_image(filepath)
100
- elif ext == "docx":
101
- return TextExtractorAgent.extract_text_from_docx(filepath)
102
- elif ext in {"xlsx", "xls"}:
103
- return TextExtractorAgent.extract_text_from_excel(filepath)
104
- return None
105
- except Exception as e:
106
- logging.error(f"Text extraction failed: {e}")
107
- return None
108
-
109
- @staticmethod
110
- def extract_text_from_pdf(filepath):
111
- """Extract text from a PDF file."""
112
- text = ""
113
- with pdfplumber.open(filepath) as pdf:
114
- for page in pdf.pages:
115
- page_text = page.extract_text()
116
- if page_text:
117
- text += page_text + "\n"
118
- return text.strip() or None
119
-
120
- @staticmethod
121
- def extract_text_from_image(filepath):
122
- """Extract text from an image using OCR."""
123
- image = cv2.imread(filepath)
124
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
125
- _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
126
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
127
- processed_path = temp_file.name
128
- cv2.imwrite(processed_path, processed)
129
- text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
130
- os.remove(processed_path)
131
- return text.strip() or None
132
-
133
- @staticmethod
134
- def extract_text_from_docx(filepath):
135
- """Extract text from a DOCX file."""
136
- doc = Document(filepath)
137
- text = "\n".join([para.text for para in doc.paragraphs])
138
- return text.strip() or None
139
-
140
- @staticmethod
141
- def extract_text_from_excel(filepath):
142
- """Extract text from an Excel file."""
143
- dfs = pd.read_excel(filepath, sheet_name=None)
144
- text = "\n".join([
145
- "\n".join([
146
- " ".join(map(str, df[col].dropna()))
147
- for col in df.columns
148
- ])
149
- for df in dfs.values()
150
- ])
151
- return text.strip() or None
152
-
153
- # PHI scrubbing agent
154
- class PHIScrubberAgent:
155
- @staticmethod
156
- def scrub_phi(text):
157
- """Remove sensitive personal health information (PHI)."""
158
- try:
159
- text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
160
- text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
161
- text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
162
- text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
163
- text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
164
- text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
165
- except Exception as e:
166
- logging.error(f"PHI scrubbing failed: {e}")
167
- return text
168
-
169
- # Summarization agent
170
- class SummarizerAgent:
171
- def __init__(self, summarization_model_loader):
172
- self.summarization_model_loader = summarization_model_loader
173
-
174
- def generate_summary(self, text):
175
- """Generate a summary of the provided text."""
176
- model = self.summarization_model_loader.load()
177
- try:
178
- summary_result = model(text, do_sample=False)
179
- return summary_result[0]['summary_text'].strip()
180
- except Exception as e:
181
- logging.error(f"Summary generation failed: {e}")
182
- return "Summary generation failed."
183
-
184
- def allowed_file(filename, allowed_extensions=ALLOWED_EXTENSIONS):
185
- """Check if the file extension is allowed."""
186
- return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
187
-
188
- # Knowledge Base
189
- class KnowledgeBase:
190
- def __init__(self, documents):
191
- self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
192
- self.documents = documents
193
- self.embeddings = self.embedding_model.encode(documents)
194
- self.dimension = self.embedding_model.get_sentence_embedding_dimension()
195
- self.index = faiss.IndexFlatL2(self.dimension)
196
- self.index.add(self.embeddings)
197
-
198
- def retrieve_relevant_info(self, query, top_k=3):
199
- """Retrieve relevant medical information from the knowledge base."""
200
- query_embedding = self.embedding_model.encode([query])
201
- distances, indices = self.index.search(query_embedding, top_k)
202
- relevant_texts = [self.documents[i] for i in indices[0]]
203
- return relevant_texts
204
-
205
- # Medical data extraction agent
206
- class MedicalDataExtractorAgent:
207
- def __init__(self, model_loader, knowledge_base):
208
- self.model_loader = model_loader
209
- self.knowledge_base = knowledge_base
210
-
211
- def retrieve_relevant_info(self, query, top_k=3):
212
- """Retrieve relevant medical information from the knowledge base."""
213
- query_embedding = self.knowledge_base.embedding_model.encode([query])
214
- distances, indices = self.knowledge_base.index.search(query_embedding, top_k)
215
- relevant_texts = [self.knowledge_base.documents[i] for i in indices[0]]
216
- return relevant_texts
217
-
218
- def extract_medical_data(self, text):
219
- """Extract structured medical data from text using Agentic RAG."""
220
- try:
221
- default_schema = {
222
- "patient_name": "[NAME]",
223
- "age": None,
224
- "gender": None,
225
- "diagnosis": [],
226
- "symptoms": [],
227
- "medications": [],
228
- "allergies": [],
229
- "vitals": {
230
- "blood_pressure": None,
231
- "heart_rate": None,
232
- "temperature": None
233
- },
234
- "notes": ""
235
- }
236
- prompt = f"""
237
- ### Instruction:
238
- Extract structured medical data from the following text as a JSON whose parameters are enclosed in "" and without any \.
239
- The JSON should include patientname, age, gender, medications, allergies, diagnosis, symptoms, vitals, and notes.
240
- ### Text:
241
- {text}
242
- ### Response:
243
- """
244
- model = self.model_loader.load()
245
- tokenizer = self.model_loader._tokenizer
246
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
247
- outputs = model.generate(
248
- inputs.input_ids,
249
- num_return_sequences=1,
250
- temperature=0.7,
251
- top_p=0.9,
252
- do_sample=True
253
- )
254
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
255
- logging.info(f"Model response: {response}")
256
- json_start = response.find("{")
257
- json_end = response.rfind("}") + 1
258
- if json_start == -1 or json_end == -1:
259
- raise ValueError("No JSON found in the model response.")
260
- structured_data = json.loads(response[json_start:json_end])
261
- normalized_data = self.normalize_json_output(structured_data, default_schema)
262
- if normalized_data["vitals"]["blood_pressure"] and isinstance(normalized_data["vitals"]["blood_pressure"], str):
263
- normalized_data["vitals"]["blood_pressure"] = normalized_data["vitals"]["blood_pressure"].strip('"')
264
- return json.dumps(normalized_data)
265
- except json.JSONDecodeError as e:
266
- logging.error(f"JSON parsing failed: {e}")
267
- return json.dumps({"error": f"Failed to parse JSON: {str(e)}"})
268
- except Exception as e:
269
- logging.error(f"Error extracting medical data: {e}")
270
- return json.dumps({"error": f"Failed to extract medical data: {str(e)}"})
271
-
272
- @staticmethod
273
- def normalize_json_output(model_output, default_schema):
274
- """Normalize the model's JSON output to match the default schema."""
275
- try:
276
- normalized_output = default_schema.copy()
277
- for key in normalized_output:
278
- if key in model_output:
279
- normalized_output[key] = model_output[key]
280
- return normalized_output
281
- except Exception as e:
282
- logging.error(f"Failed to normalize JSON: {e}")
283
- return default_schema
284
-
285
- # Initialize lazy loaders
286
- medalpaca_model_loader = LazyModelLoader(
287
- model_name="stanford-crfm/BioMedLM",
288
- task="text-generation"
289
- )
290
- summarization_model_loader = LazyModelLoader("google-t5/t5-small", "summarization")
291
-
292
- # Initialize knowledge base
293
- medical_documents = [
294
- "Hypertension is a chronic condition characterized by elevated blood pressure.",
295
- "Diabetes is a metabolic disorder that affects blood sugar levels.",
296
- "Common symptoms of chest pain include pressure, tightness, or discomfort in the chest."
297
- ]
298
- knowledge_base = KnowledgeBase(medical_documents)
299
-
300
- # Initialize agents
301
- text_extractor_agent = TextExtractorAgent()
302
- phi_scrubber_agent = PHIScrubberAgent()
303
- medical_data_extractor_agent = MedicalDataExtractorAgent(medalpaca_model_loader, knowledge_base)
304
- summarizer_agent = SummarizerAgent(summarization_model_loader)
305
-
306
- # NER to Detect medical info
307
- CONFIDENCE_THRESHOLD = 0.80
308
-
309
- def extract_medical_entities(text, ner_pipeline):
310
- if not text or not text.strip():
311
- return ["No medical entities found"]
312
- if ner_pipeline is None:
313
- print("⚠️ NER model is not loaded, skipping entity extraction.")
314
- return ["No medical entities found"]
315
-
316
- ner_results = ner_pipeline(text)
317
- relevant_entities = {
318
- "Disease", "MedicalCondition", "Symptom", "Sign_or_Symptom",
319
- "B-DISEASE", "I-DISEASE",
320
- "Test", "Measurement", "B-TEST", "I-TEST", "Lab_value", "B-Lab_value", "I-Lab_value",
321
- "Medication", "B-MEDICATION", "I-MEDICATION", "Treatment",
322
- "Procedure", "B-Diagnostic_procedure", "I-Diagnostic_procedure",
323
- "Anatomical_site", "Body_Part", "Organ_or_Tissue",
324
- "Diagnostic_procedure", "Surgical_Procedure", "Therapeutic_Procedure",
325
- "Health_condition", "B-Health_condition", "I-Health_condition",
326
- "Pathological_Condition", "Clinical_Event",
327
- "Chemical_Substance", "B-Chemical_Substance", "I-Chemical_Substance",
328
- "Biological_Entity", "B-Biological_Entity", "I-Biological_Entity"
329
- }
330
-
331
- medical_entities = set()
332
- for ent in ner_results:
333
- entity_label = ent.get("entity_group") or ent.get("entity")
334
- if entity_label in relevant_entities and ent["score"] >= CONFIDENCE_THRESHOLD:
335
- word = ent["word"].lower().strip().replace("-", "")
336
- if len(word) > 2:
337
- medical_entities.add(word)
338
-
339
- if len(medical_entities) >= 5:
340
- return list(medical_entities)
341
-
342
- return ["No medical entities found"]
343
-
344
- # Validation: Check File Size
345
- def check_file_size(file):
346
- file.seek(0, os.SEEK_END)
347
- size = file.tell()
348
- file.seek(0)
349
- extension = file.filename.rsplit('.', 1)[-1].lower()
350
- if extension in {'pdf', 'docx'} and size > MAX_SIZE_PDF_DOCS:
351
- return False, f"File {file.filename} exceeds 1GB size limit"
352
- elif extension in {'jpg', 'jpeg', 'png'} and size > MAX_SIZE_IMAGES:
353
- return False, f"Image {file.filename} exceeds 500MB size limit"
354
- return True, None
355
-
356
- def extract_patient_name(text, qa_pipeline):
357
- """Extracts patient name using the given QA pipeline."""
358
- if not text or not qa_pipeline:
359
- return None
360
- try:
361
- result = qa_pipeline(
362
- question="What is the patient's name?",
363
- context=text
364
- )
365
- return result.get("answer", "").strip()
366
- except Exception as e:
367
- print(f"⚠️ Error extracting patient name: {e}")
368
- return None
369
-
370
- def normalize_name(name):
371
- """Cleans and normalizes names for comparison, removing salutations dynamically."""
372
- if not name:
373
- return ""
374
- name = name.lower().strip()
375
- name = re.sub(r"[^\w\s]", "", name)
376
- name = re.sub(r"^\b\w{1,5}\b\s+", "", name)
377
- return name
378
-
379
- def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
380
- """Validates if the extracted name matches the registered patient name."""
381
- detected_name = extract_patient_name(extracted_text, qa_pipeline)
382
- if not detected_name:
383
- return jsonify({"error": f"Could not determine patient name from {filename}"}), 400
384
- normalized_detected_name = normalize_name(detected_name)
385
- normalized_patient_name = normalize_name(patient_name)
386
- if normalized_detected_name not in normalized_patient_name:
387
- return jsonify({
388
- "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
389
- }), 400
390
- return None
391
-
392
- def is_blurred(image_path, variance_threshold=150):
393
- try:
394
- image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
395
- if image is None:
396
- print(f"❌ Error: Unable to read image {image_path}")
397
- return True
398
- laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
399
- print(f"🔍 Blur Check: Variance={laplacian_var} (Threshold={variance_threshold})")
400
- edges = cv2.Canny(image, 50, 150)
401
- edge_density = np.mean(edges)
402
- print(f"📏 Edge Density: {edge_density}")
403
- return laplacian_var < variance_threshold and edge_density < 10
404
- except Exception as e:
405
- print(f"❌ Error detecting blur: {e}")
406
- return True
407
-
408
- def extract_text_from_image(filepath):
409
- try:
410
- if is_blurred(filepath):
411
- return "Image is too blurry, OCR failed."
412
- image = cv2.imread(filepath)
413
- if image is None:
414
- return "Image could not be read."
415
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
416
- gray = cv2.GaussianBlur(gray, (5, 5), 0)
417
- gray = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
418
- cv2.THRESH_BINARY, 11, 2)
419
- kernel = np.ones((2,2), np.uint8)
420
- gray = cv2.dilate(gray, kernel, iterations=1)
421
- processed_path = f"{filepath}_processed.png"
422
- cv2.imwrite(processed_path, gray)
423
- text = pytesseract.image_to_string(Image.open(processed_path), lang='eng').strip()
424
- words = text.split()
425
- if len(words) < 5:
426
- return "OCR failed to extract meaningful text."
427
- return text
428
- except Exception as e:
429
- print(f"❌ Error processing {filepath}: {e}")
430
- return "Failed to extract text"
431
-
432
- def extract_text_from_pdf(filepath, password=None):
433
- """Extract text from PDFs using pdfplumber (faster) or OCR (if needed)."""
434
- text = ""
435
- try:
436
- reader = PdfReader(filepath)
437
- if reader.is_encrypted:
438
- if not password:
439
- print("🔒 PDF is encrypted but no password was provided.")
440
- return {"error": "File is password-protected. Please provide a password."}, 401
441
- decryption_result = reader.decrypt(password)
442
- if decryption_result == 0:
443
- print("❌ Incorrect password provided!")
444
- return {"error": "Invalid password provided."}, 403
445
- else:
446
- print("✅ PDF successfully decrypted!")
447
- text = "\n".join([page.extract_text() or "" for page in reader.pages])
448
- if text.strip():
449
- return text.strip(), 200
450
- with pdfplumber.open(filepath) as pdf:
451
- for page in pdf.pages:
452
- page_text = page.extract_text()
453
- if page_text:
454
- text += page_text + "\n"
455
- if text.strip():
456
- return text.strip(), 200
457
- images = convert_from_path(filepath)
458
- with ThreadPoolExecutor(max_workers=5) as pool:
459
- ocr_text = list(pool.map(lambda img: pytesseract.image_to_string(img, lang='eng'), images))
460
- return ("\n".join(ocr_text).strip(), 200) if ocr_text else ("No text found", 415)
461
- except Exception as e:
462
- print(f"❌ Error processing PDF {filepath}: {e}")
463
- return "Failed to extract text"
464
-
465
- def extract_text_from_docx(filepath):
466
- doc = Document(filepath)
467
- text = "\n".join([para.text for para in doc.paragraphs])
468
- return text.strip() or None
469
-
470
- def clean_result(value):
471
- value = re.sub(r"\s+", " ", value)
472
- value = re.sub(r"[-_:]+", " ", value)
473
- value = re.sub(r"[^\x00-\x7F]+", " ", value)
474
- return value if value else "Not Available"
475
-
476
- def mask_sensitive_info(text):
477
- text = re.sub(r'(?<=\b\w{2})\w+(?=\s\w{2,})', '***', text)
478
- text = re.sub(r'\b(\d{2})\d{2}-(\d{2})\d{2}-(\d{2})\d{2}\b', r'**\2-**\3-**', text)
479
- text = re.sub(r'\b(\d{8})(\d{2})\b', r'********\2', text)
480
- return text
481
-
482
- # API Endpoints
483
- @app.route('/extract_medical_data', methods=['POST'])
484
- def extract_medical_data():
485
- """Extract structured medical data from raw text."""
486
- try:
487
- data = request.json
488
- if "text" not in data or not data["text"].strip():
489
- return jsonify({"error": "No valid text provided"}), 400
490
- raw_text = data["text"]
491
- clean_text = phi_scrubber_agent.scrub_phi(raw_text)
492
- structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
493
- return jsonify(json.loads(structured_data)), 200
494
- except Exception as e:
495
- logging.error(f"Failed to extract medical data: {e}")
496
- return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
497
-
498
- @app.route('/api/transcribe', methods=['POST'])
499
- def transcribe_audio():
500
- """Transcribe audio files into text."""
501
- if 'audio' not in request.files:
502
- abort(400, description="No audio file provided")
503
- audio_file = request.files['audio']
504
- if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
505
- abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
506
- filename = secure_filename(audio_file.filename)
507
- audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
508
- audio_file.save(audio_path)
509
- try:
510
- result = whisper_model.transcribe(audio_path)
511
- transcribed_text = result["text"]
512
- os.remove(audio_path)
513
- return jsonify({"transcribed_text": transcribed_text}), 200
514
- except Exception as e:
515
- logging.error(f"Transcription failed: {str(e)}")
516
- return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
517
-
518
- @app.route('/api/generate_summary', methods=['POST'])
519
- def generate_summary():
520
- """Generate a summary from the provided text."""
521
- data = request.json
522
- if "text" not in data or not data["text"].strip():
523
- return jsonify({"error": "No valid text provided"}), 400
524
- context = data["text"]
525
- clean_text = phi_scrubber_agent.scrub_phi(context)
526
- summary = summarizer_agent.generate_summary(clean_text)
527
- return jsonify({"summary": summary}), 200
528
-
529
- @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
530
- def extract_medical_data_from_audio():
531
- """Extract medical data from transcribed audio."""
532
- if 'audio' not in request.files:
533
- abort(400, description="No audio file provided")
534
- audio_file = request.files['audio']
535
- if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
536
- abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
537
- logging.info(audio_file.filename)
538
- logging.info(app.config['UPLOAD_FOLDER'])
539
- filename = secure_filename(audio_file.filename)
540
- logging.info(filename)
541
- audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
542
- logging.info(audio_path)
543
- audio_file.save(audio_path)
544
- try:
545
- result = whisper_model.transcribe(audio_path)
546
- transcribed_text = result["text"]
547
- clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
548
- summary = summarizer_agent.generate_summary(transcribed_text)
549
- structured_data = medical_data_extractor_agent.extract_medical_data(transcribed_text)
550
- response = {
551
- "transcribed_text": transcribed_text,
552
- "summary": summary,
553
- "medical_chart": json.loads(structured_data)
554
- }
555
- os.remove(audio_path)
556
- return jsonify(response), 200
557
- except Exception as e:
558
- logging.error(f"Processing failed: {str(e)}")
559
- return jsonify({"error": f"Processing failed: {str(e)}"}), 500
560
-
561
- @app.route('/upload', methods=['POST'])
562
- def upload_file():
563
- files = request.files.getlist("file")
564
- patient_name = request.form.get("patient_name", "").strip()
565
- password = request.form.get("password")
566
- qa_model_name = request.form.get("qa_model_name")
567
- qa_model_type = request.form.get("qa_model_type")
568
- ner_model_name = request.form.get("ner_model_name")
569
- ner_model_type = request.form.get("ner_model_type")
570
- summarizer_model_name = request.form.get("summarizer_model_name")
571
- summarizer_model_type = request.form.get("summarizer_model_type")
572
-
573
- if not files:
574
- return jsonify({"error": "No file uploaded"}), 400
575
-
576
- try:
577
- qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
578
- print(f"✅ QA Model Loaded: {qa_model_name}")
579
- except Exception as e:
580
- return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
581
-
582
- try:
583
- ner_pipeline = pipeline(task=ner_model_type, model=ner_model_name)
584
- print(f"✅ NER Model Loaded: {ner_model_name}")
585
- except Exception as e:
586
- return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
587
-
588
- try:
589
- summarizer_pipeline = pipeline(task=summarizer_model_type, model=summarizer_model_name)
590
- print(f"✅ Summarizer Model Loaded: {summarizer_model_name}")
591
- except Exception as e:
592
- return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
593
-
594
- extracted_data = []
595
- print(patient_name)
596
-
597
- for file in files:
598
- if file.filename == '':
599
- continue
600
- if not allowed_file(file.filename):
601
- return jsonify({"error": f"Unsupported file type: {file.filename}. Supported file types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
602
- if not patient_name:
603
- return jsonify({"error": "Patient name is missing"}), 400
604
- valid_size, error_message = check_file_size(file)
605
- if not valid_size:
606
- return jsonify({"error": error_message}), 400
607
-
608
- filename = secure_filename(file.filename)
609
- filepath = os.path.join(UPLOAD_FOLDER, filename)
610
- file.save(filepath)
611
-
612
- extracted_text = None
613
-
614
- if filename.endswith(".pdf"):
615
- result = extract_text_from_pdf(filepath, password)
616
- if isinstance(result, tuple):
617
- extracted_text, status_code = result
618
- else:
619
- extracted_text = result
620
- status_code = 200
621
- if isinstance(extracted_text, dict) and "error" in extracted_text:
622
- return jsonify(extracted_text), status_code
623
- elif filename.endswith(".docx"):
624
- extracted_text = extract_text_from_docx(filepath)
625
- elif filename.endswith((".jpg", ".jpeg", ".png", ".svg")):
626
- extracted_text = extract_text_from_image(filepath)
627
-
628
- if not extracted_text or extracted_text == "No text found":
629
- return jsonify({"error": f"Failed to extract text from {filename}"}), 415
630
- if extracted_text in ["Image is too blurry, OCR failed.", "OCR failed to extract meaningful text."]:
631
- return jsonify({"error": f"'{filename}' is too blurry or text is unreadable."}), 422
632
-
633
- skip_medical_check = request.form.get("skip_medical_check", "false").lower() == "true"
634
- if not skip_medical_check:
635
- ner_results = ner_pipeline(extracted_text)
636
- medical_entities = list(set([r["word"] for r in ner_results if r["entity"].startswith("B-") or r["entity"].startswith("I-")]))
637
- print(f"Medical entities found: {medical_entities}")
638
- if not medical_entities:
639
- return jsonify({"error": f"'{filename}' is not medically relevant"}), 406
640
- else:
641
- print(f"Skipping Medical Validation for {filename}")
642
-
643
- skip_patient_check = request.form.get("skip_patient_check", "false").lower() == "true"
644
- if not skip_patient_check:
645
- try:
646
- error_response = validate_patient_name(extracted_text, patient_name, filename, qa_pipeline)
647
- if error_response:
648
- return error_response
649
- except Exception as e:
650
- return jsonify({"error": f"Patient name validation failed: {str(e)}"}), 500
651
- else:
652
- print(f"Skipping Patient Name Validation for {filename}")
653
-
654
- try:
655
- summary = summarizer_pipeline(extracted_text, max_length=350, min_length=50, do_sample=False)[0]["summary_text"]
656
- except Exception as e:
657
- summary = "Summary failed"
658
- print(f"⚠️ Error summarizing: {e}")
659
-
660
- extracted_data.append({
661
- "file": filename,
662
- "extracted_text": extracted_text,
663
- "summary": summary,
664
- "message": "Successful"
665
- })
666
-
667
- extracted_text = None
668
- summary = None
669
-
670
- if not extracted_data:
671
- return jsonify({"error": "No valid medical files processed"}), 400
672
-
673
- return jsonify({"extracted_data": extracted_data}), 200
674
-
675
- @app.route('/extract_medical_data_questions', methods=['POST'])
676
- def extract_medical_data_questions():
677
- """Extract medical data based on predefined questions."""
678
- data = request.json
679
- qa_model_name = data.get("qa_model_name")
680
- qa_model_type = data.get("qa_model_type")
681
- if "extracted_data" not in data:
682
- return jsonify({"error": "Missing 'extracted_data' in request"}), 400
683
-
684
- if not qa_model_name or not qa_model_type:
685
- return jsonify({"error": "Missing 'model_name' or 'model_type'"}), 400
686
-
687
- try:
688
- print(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
689
- qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
690
- loaded_model_name = qa_pipeline.model.config._name_or_path
691
- loaded_model_type = qa_pipeline.task
692
- print(f"✅ Model loaded: {loaded_model_name}")
693
- except Exception as e:
694
- print("❌ Error loading model:", str(e))
695
- return jsonify({"error": f"Could not load model: {str(e)}"}), 500
696
-
697
- questions = {
698
- "Patient Name": "What is the patient's name?",
699
- "Age": "What is the patient's age?",
700
- "Gender": "What is the patient's gender?",
701
- "Date of Birth": "What is the patient's date of birth?",
702
- "Patient ID": "What is the patient ID?",
703
- "Reason for Visit": "What is the reason for the patient's visit?",
704
- "Physician": "Who is the physician in charge of the patient?",
705
- "Test Date": "What is the test date?",
706
- "Hemoglobin": "What is the patient's hemoglobin level?",
707
- "Blood Glucose (Fasting)": "What is the patient's fasting blood glucose level?",
708
- "Total Cholesterol": "What is the total cholesterol level?",
709
- "LDL Cholesterol": "What is the LDL cholesterol level?",
710
- "HDL Cholesterol": "What is the HDL cholesterol level?",
711
- "Serum Creatinine": "What is the serum creatinine level?",
712
- "Vitamin D (25-OH)": "What is the patient's Vitamin D level?",
713
- "Height": "What is the patient's height?",
714
- "Weight": "What is the patient's weight?",
715
- "Blood Pressure (Systolic)": "What is the patient's systolic blood pressure?",
716
- "Blood Pressure (Diastolic)": "What is the patient's diastolic blood pressure?",
717
- "Recommendations": "What are the recommendations based on the test results?"
718
- }
719
-
720
- structured_response = {"extracted_data": []}
721
-
722
- for file_data in data["extracted_data"]:
723
- filename = file_data["file"]
724
- context = file_data["extracted_text"]
725
-
726
- if not context:
727
- structured_response["extracted_data"].append({
728
- "file": filename,
729
- "medical_terms": "No data extracted",
730
- })
731
- continue
732
-
733
- extracted_info = {}
734
-
735
- for key, question in questions.items():
736
- try:
737
- result = qa_pipeline(question=question, context=context)
738
- extracted_info[key] = clean_result(result.get("answer", "Not Available"))
739
- except:
740
- extracted_info[key] = "Error extracting"
741
-
742
- categorized_data = [
743
- {
744
- "name": "Patient Information",
745
- "fields": [
746
- {"label": "Patient Name", "value": extracted_info.get("Patient Name", "")},
747
- {"label": "Date of Birth", "value": extracted_info.get("Date of Birth", "")},
748
- {"label": "Gender", "value": extracted_info.get("Gender", "")},
749
- {"label": "Patient ID", "value": extracted_info.get("Patient ID", "")}
750
- ]
751
- },
752
- {
753
- "name": "Vitals",
754
- "fields": [
755
- {"label": "Height", "value": extracted_info.get("Height", "")},
756
- {"label": "Weight", "value": extracted_info.get("Weight", "")},
757
- {"label": "Blood Pressure", "value": f"{extracted_info.get('Blood Pressure (Systolic)', '')}/{extracted_info.get('Blood Pressure (Diastolic)', '')} mmHg"},
758
- {"label": "Hemoglobin", "value": extracted_info.get("Hemoglobin", "")},
759
- {"label": "Serum Creatinine", "value": extracted_info.get("Serum Creatinine", "")}
760
- ]
761
- },
762
- {
763
- "name": "Lab Results",
764
- "fields": [
765
- {"label": "Blood Glucose (Fasting)", "value": extracted_info.get("Blood Glucose (Fasting)", "")},
766
- {"label": "Total Cholesterol", "value": extracted_info.get("Total Cholesterol", "")},
767
- {"label": "LDL Cholesterol", "value": extracted_info.get("LDL Cholesterol", "")},
768
- {"label": "HDL Cholesterol", "value": extracted_info.get("HDL Cholesterol", "")},
769
- {"label": "Vitamin D (25-OH)", "value": extracted_info.get("Vitamin D (25-OH)", "")}
770
- ]
771
- },
772
- {
773
- "name": "Medical Notes",
774
- "fields": [
775
- {"label": "Reason for Visit", "value": extracted_info.get("Reason for Visit", "")},
776
- {"label": "Physician", "value": extracted_info.get("Physician", "")},
777
- {"label": "Test Date", "value": extracted_info.get("Test Date", "")},
778
- {"label": "Recommendations", "value": extracted_info.get("Recommendations", "")}
779
- ]
780
- }
781
- ]
782
- structured_response["extracted_data"].append({
783
- "file": filename,
784
- "medical_terms": extracted_info,
785
- "categorized_data": categorized_data,
786
- "model_used": loaded_model_name,
787
- "model_type": loaded_model_type
788
- })
789
-
790
- save_data_to_storage(filename, structured_response)
791
- print(f"✅ Extracted data saved to: {os.path.join(UPLOAD_FOLDER, f'{filename}.json')}")
792
-
793
- return jsonify(structured_response), 200
794
-
795
- def get_data_from_storage(filename):
796
- try:
797
- filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
798
- print(f"🔍 Looking for file at: {filepath}")
799
- if not os.path.exists(filepath):
800
- print(f"🚫 File not found at: {filepath}")
801
- return None
802
- with open(filepath, "r") as file:
803
- data = json.load(file)
804
- print(f"✅ File found and loaded: {filepath}")
805
- return data
806
- except Exception as e:
807
- print(f"🚨 Error loading data: {e}")
808
- return None
809
-
810
- def save_data_to_storage(filename, data):
811
- try:
812
- filename = filename.rsplit(".", 1)[0]
813
- filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
814
- print(f"Saving to: {filepath}")
815
- print(f"Directory exists: {os.path.exists(UPLOAD_FOLDER)}")
816
- if not os.path.exists(UPLOAD_FOLDER):
817
- print(f"Directory not found. Creating: {UPLOAD_FOLDER}")
818
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
819
- with open(filepath, "w") as file:
820
- json.dump(data, file)
821
- print(f"✅ Data saved successfully to {filepath}")
822
- except Exception as e:
823
- print(f"🚨 Exception during save: {e}")
824
-
825
- @app.route('/get_updated_medical_data', methods=['GET'])
826
- def get_updated_data():
827
- file_name = request.args.get('file')
828
- if not file_name:
829
- return jsonify({"error": "File name is required"}), 400
830
- file_name = file_name.rsplit(".", 1)[0]
831
- updated_data = get_data_from_storage(file_name)
832
- if updated_data:
833
- return jsonify({"file": file_name, "data": updated_data}), 200
834
- else:
835
- return jsonify({"error": f"File '{file_name}' not found"}), 404
836
-
837
- @app.route('/update_medical_data', methods=['PUT'])
838
- def update_medical_data():
839
- try:
840
- data = request.json
841
- print("Received data:", data)
842
- filename = data.get("file")
843
- filename = filename.rsplit(".", 1)[0]
844
- updates = data.get("updates", [])
845
- if not filename or not updates:
846
- return jsonify({"error": "File name or updates missing"}), 400
847
- existing_data = get_data_from_storage(filename)
848
- if not existing_data:
849
- return jsonify({"error": f"File '{filename}' not found"}), 404
850
- for update in updates:
851
- category = update.get("category")
852
- field = update.get("field")
853
- new_value = update.get("value")
854
- updated = False
855
- for cat in existing_data.get("extracted_data", []):
856
- for categorized_data in cat.get("categorized_data", []):
857
- if categorized_data.get("name") == category:
858
- for fld in categorized_data.get("fields", []):
859
- if fld.get("label") == field:
860
- print(f"🔄 Updating {category} -> {field} from '{fld['value']}' to '{new_value}'")
861
- fld["value"] = new_value
862
- updated = True
863
- break
864
- if updated:
865
- break
866
- if updated:
867
- break
868
- save_data_to_storage(filename, existing_data)
869
- print("✅ Updated data:", existing_data)
870
- return jsonify({"message": "Data updated successfully", "updated_data": existing_data}), 200
871
- except Exception as e:
872
- print("❌ Error:", str(e))
873
- return jsonify({"error": str(e)}), 500
874
-
875
- @app.route('/')
876
- def home():
877
- return "Medical Data Extraction API is running!"
878
-
879
- if __name__ == '__main__':
880
- app.run(host='0.0.0.0', port=5000, debug=True)
document_based_extraction.py DELETED
@@ -1,1188 +0,0 @@
1
- import os, re, json
2
- import time, logging, functools
3
- import pytesseract
4
- import cv2
5
- import pdfplumber
6
- import numpy as np
7
- from PIL import Image
8
- from PyPDF2 import PdfReader
9
- from pdf2image import convert_from_path
10
- from flask import Flask, request, jsonify
11
- from flask_cors import CORS
12
- import torch
13
- from werkzeug.utils import secure_filename
14
- from docx import Document
15
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
16
- from concurrent.futures import ThreadPoolExecutor, as_completed
17
- from collections import defaultdict
18
- from huggingface_hub import login
19
-
20
-
21
- # -------------------- Logging Config -------------------- #
22
- logging.basicConfig(
23
- level=logging.INFO,
24
- format="%(asctime)s - %(levelname)s - %(message)s",
25
- handlers=[
26
- logging.FileHandler("app.log"),
27
- logging.StreamHandler()
28
- ]
29
- )
30
- logger = logging.getLogger(__name__)
31
-
32
- # -------------------- Execution Time Decorator -------------------- #
33
- def log_execution_time(level=logging.INFO):
34
- def decorator(func):
35
- @functools.wraps(func)
36
- def wrapper(*args, **kwargs):
37
- start_time = time.time()
38
- try:
39
- result = func(*args, **kwargs)
40
- duration = time.time() - start_time
41
- logger.log(level, f"⏱️ {func.__name__} executed in {duration:.6f} seconds")
42
- return result
43
- except Exception as e:
44
- duration = time.time() - start_time
45
- logger.exception(f"❌ Exception in {func.__name__} after {duration:.6f} seconds: {e}")
46
- raise
47
- return wrapper
48
- return decorator
49
-
50
-
51
- login(
52
- "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"
53
- ) # 🧠 This will store it and every model load will use it
54
-
55
- executor = ThreadPoolExecutor(max_workers=5)
56
- logger.info("Executor initialized with 5 workers")
57
-
58
- # Set Tesseract OCR Path
59
- # in Windows
60
- # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
61
- # in Linux
62
- pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
63
-
64
- # Set up Flask app
65
- app = Flask(__name__)
66
- CORS(app)
67
-
68
- UPLOAD_FOLDER = "uploads"
69
- ALLOWED_EXTENSIONS = {"pdf", "jpg", "jpeg", "png", "svg", "docx", "doc"}
70
- app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER
71
-
72
- # Set file size limits
73
- MAX_SIZE_PDF_DOCS = 1 * 1024 * 1024 * 1024 # *1GB*
74
- MAX_SIZE_IMAGES = 500 * 1024 * 1024 # *500MB*
75
-
76
-
77
- # # Load ClinicalBERT Model for Classification
78
- # try:
79
- # zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
80
- # print("✅ zero_shot_classifier Model Loaded Successfully")
81
- # except Exception as e:
82
- # zero_shot_classifier = None
83
- # print("❌ Error loading ClinicalBERT Model:", str(e))
84
-
85
-
86
- if not os.path.exists(UPLOAD_FOLDER):
87
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
88
-
89
- # NER to Detect medical info
90
- CONFIDENCE_THRESHOLD = 0.80
91
-
92
-
93
- @log_execution_time()
94
- def extract_medical_entities(text):
95
- if not text or not text.strip():
96
- return ["No medical entities found"]
97
- if ner_pipeline is None: # type: ignore
98
- logger.warning("NER model is not loaded, skipping entity extraction.")
99
- return ["No medical entities found"]
100
-
101
- ner_results = ner_pipeline(text) # type: ignore
102
- relevant_entities = {
103
- # Diseases & Symptoms
104
- "Disease",
105
- "MedicalCondition",
106
- "Symptom",
107
- "Sign_or_Symptom",
108
- "B-DISEASE",
109
- "I-DISEASE",
110
- # Tests, Measurements, and Lab Values
111
- "Test",
112
- "Measurement",
113
- "B-TEST",
114
- "I-TEST",
115
- "Lab_value",
116
- "B-Lab_value",
117
- "I-Lab_value",
118
- # Medications, Treatments, and Procedures
119
- "Medication",
120
- "B-MEDICATION",
121
- "I-MEDICATION",
122
- "Treatment",
123
- "Procedure",
124
- "B-Diagnostic_procedure",
125
- "I-Diagnostic_procedure",
126
- # Body Parts & Medical Anatomy
127
- "Anatomical_site",
128
- "Body_Part",
129
- "Organ_or_Tissue",
130
- # Medical Procedures
131
- "Diagnostic_procedure",
132
- "Surgical_Procedure",
133
- "Therapeutic_Procedure",
134
- # Clinical Terms
135
- "Health_condition",
136
- "B-Health_condition",
137
- "I-Health_condition",
138
- "Pathological_Condition",
139
- "Clinical_Event",
140
- # Biological & Chemical Substances (Relevant to Lab Reports)
141
- "Chemical_Substance",
142
- "B-Chemical_Substance",
143
- "I-Chemical_Substance",
144
- "Biological_Entity",
145
- "B-Biological_Entity",
146
- "I-Biological_Entity",
147
- }
148
-
149
- medical_entities = set()
150
- for ent in ner_results:
151
- entity_label = ent.get("entity_group") or ent.get("entity")
152
- if entity_label in relevant_entities and ent["score"] >= CONFIDENCE_THRESHOLD:
153
- word = ent["word"].lower().strip().replace("-", "") # Normalize text
154
- if len(word) > 2: # Ignore short/junk words
155
- medical_entities.add(word)
156
-
157
- if len(medical_entities) >= 5:
158
- logger.info(f"Extracted {len(medical_entities)} medical entities")
159
- return list(medical_entities)
160
-
161
- logger.info("Not enough medical entities found")
162
- return ["No medical entities found"]
163
-
164
-
165
- # Validation: Check Allowed File Types
166
- def allowed_file(filename):
167
- return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
168
-
169
-
170
- # Validation: Check File Size
171
- def check_file_size(file):
172
- file.seek(0, os.SEEK_END)
173
- size = file.tell()
174
- file.seek(0)
175
- extension = file.filename.rsplit(".", 1)[-1].lower()
176
- logger.info(f"Checking file size for '{file.filename}' - Size: {size} bytes")
177
- if extension in {"pdf", "docx"} and size > MAX_SIZE_PDF_DOCS:
178
- logger.warning(f"{file.filename} exceeds 1GB limit")
179
- return False, f"File {file.filename} exceeds 1MB size limit"
180
- elif extension in {"jpg", "jpeg", "png"} and size > MAX_SIZE_IMAGES:
181
- logger.warning(f"{file.filename} exceeds 500MB image limit")
182
- return False, f"Image {file.filename} exceeds 500KB size limit"
183
- return True, None
184
-
185
-
186
- @log_execution_time()
187
- def extract_patient_name(text, qa_pipeline):
188
- if not text or not qa_pipeline:
189
- return None
190
- try:
191
- result = qa_pipeline(question="What is the patient's name?", context=text)
192
- answer = result.get("answer", "").strip()
193
- logger.info(f"Extracted patient name: {answer}")
194
- return answer
195
- except Exception as e:
196
- logger.error(f"Error extracting patient name: {e}")
197
- return None
198
-
199
-
200
- def normalize_name(name):
201
- """Cleans and normalizes names for comparison, removing salutations dynamically"""
202
- if not name:
203
- return ""
204
- name = name.lower().strip()
205
- name = re.sub(r"[^\w\s]", "", name)
206
- name = re.sub(r"^\b\w{1,5}\b\s+", "", name) # Matches short words at the start
207
- return name
208
-
209
-
210
- @log_execution_time()
211
- def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
212
- """Validates if the extracted name matches the registered patient name"""
213
- detected_name = extract_patient_name(extracted_text, qa_pipeline)
214
- if not detected_name:
215
- logger.warning(f"Could not determine patient name from {filename}")
216
- return (
217
- jsonify({"error": f"Could not determine patient name from {filename}"}),
218
- 400,
219
- )
220
-
221
- normalized_detected_name = normalize_name(detected_name)
222
- normalized_patient_name = normalize_name(patient_name)
223
-
224
- if normalized_detected_name not in normalized_patient_name:
225
- logger.warning(
226
- f"Patient mismatch in file '{filename}': Found '{detected_name}'"
227
- )
228
- return (
229
- jsonify(
230
- {
231
- "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
232
- }
233
- ),
234
- 400,
235
- )
236
- logger.info(f"Patient name validation passed for '{filename}'")
237
- return None # No error, validation passed
238
-
239
-
240
- # Check if the image is blurred using the Laplacian method
241
- def is_blurred(image_path, variance_threshold=150):
242
- try:
243
- image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
244
- if image is None:
245
- logger.error(f"Unable to read image: {image_path}")
246
- return True # Assume it's blurry if not readable
247
-
248
- # Compute Laplacian variance
249
- laplacian_var = cv2.Laplacian(image, cv2.CV_64F).var()
250
- logger.info(
251
- f"Blur Check on '{image_path}': Laplacian Variance = {laplacian_var:.2f} (Threshold = {variance_threshold})"
252
- )
253
-
254
- # Compute Edge Density (Additional Check)
255
- edges = cv2.Canny(image, 50, 150)
256
- edge_density = np.mean(edges)
257
- logger.info(f"Edge Density for '{image_path}': {edge_density:.2f}")
258
-
259
- is_blurry = laplacian_var < variance_threshold and edge_density < 10
260
- if is_blurry:
261
- logger.warning(f"Image '{image_path}' flagged as blurry.")
262
- return is_blurry
263
- except Exception as e:
264
- logger.exception(f"Exception during blur detection for '{image_path}': {e}")
265
- return True # Assume it's blurry on failure
266
-
267
-
268
- # Helper Function: Extract Text from Images (OCR) with Blur Detection
269
- @log_execution_time()
270
- def extract_text_from_image(filepath):
271
- try:
272
- # Check if the image is blurry
273
- if is_blurred(filepath):
274
- logger.warning(f"OCR skipped: '{filepath}' is too blurry.")
275
- return "Image is too blurry, OCR failed."
276
-
277
- image = cv2.imread(filepath)
278
- if image is None:
279
- logger.error(f"OCR failed: Unable to read image '{filepath}'.")
280
- return "Image could not be read."
281
-
282
- # Convert to Grayscale and Apply Thresholding
283
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
284
- gray = cv2.GaussianBlur(gray, (5, 5), 0)
285
- gray = cv2.adaptiveThreshold(
286
- gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
287
- )
288
-
289
- # Apply dilation (bolds the text) for better OCR accuracy
290
- kernel = np.ones((2, 2), np.uint8)
291
- gray = cv2.dilate(gray, kernel, iterations=1)
292
- processed_path = f"{filepath}_processed.png"
293
- cv2.imwrite(processed_path, gray)
294
- logger.info(f"Image preprocessed and saved: {processed_path}")
295
- text = pytesseract.image_to_string(
296
- Image.open(processed_path), lang="eng"
297
- ).strip()
298
- # Validate OCR output (Reject if too little text is extracted)
299
- word_count = len(text.split())
300
- logger.info(
301
- f"OCR completed for '{filepath}' with {word_count} words extracted."
302
- )
303
-
304
- if word_count < 5:
305
- logger.warning(f"OCR output too small for '{filepath}'. Might be junk.")
306
- return "OCR failed to extract meaningful text."
307
-
308
- return text
309
-
310
- except Exception as e:
311
- logger.exception(f"Error extracting text from image '{filepath}': {e}")
312
- return "Failed to extract text"
313
-
314
-
315
- # Helper Function: Extract Text from PDF
316
- @log_execution_time()
317
- def extract_text_from_pdf(filepath, password=None):
318
- """Extract text from PDFs using pdfplumber (faster) or OCR (if needed)."""
319
- text = ""
320
-
321
- try:
322
- logger.info(f"Starting PDF extraction: {filepath}")
323
- reader = PdfReader(filepath)
324
-
325
- if reader.is_encrypted:
326
- if not password:
327
- logger.warning("Encrypted PDF without password.")
328
- return {
329
- "error": "File is password-protected. Please provide a password."
330
- }, 401
331
-
332
- # ✅ Attempt to decrypt
333
- decryption_result = reader.decrypt(password)
334
- if decryption_result == 0: # Decryption failed
335
- logger.error("Incorrect password provided.")
336
- return {"error": "Invalid password provided."}, 403
337
- else:
338
- logger.info("PDF decryption successful.")
339
-
340
- text = "\n".join([page.extract_text() or "" for page in reader.pages])
341
- if text.strip():
342
- logger.info("Text extracted from decrypted PDF.")
343
- return text.strip(), 200
344
-
345
- # ✅ Now, use pdfplumber for text extraction
346
- with pdfplumber.open(filepath) as pdf:
347
- for page in pdf.pages:
348
- page_text = page.extract_text()
349
- if page_text:
350
- text += page_text + "\n"
351
-
352
- if text.strip():
353
- logger.info(
354
- f"PDF text extracted using pdfplumber: {len(text.split())} words."
355
- )
356
- return text.strip(), 200 # ✅ Always return a tuple (text, status)
357
-
358
- logger.info("No text found via pdfplumber. Falling back to OCR.")
359
- # ✅ Use OCR if the PDF has no selectable text
360
- images = convert_from_path(filepath)
361
- with ThreadPoolExecutor(max_workers=5) as pool:
362
- ocr_text = list(
363
- pool.map(
364
- lambda img: pytesseract.image_to_string(img, lang="eng"), images
365
- )
366
- )
367
-
368
- full_ocr_text = "\n".join(ocr_text).strip()
369
- logger.info(
370
- f"OCR fallback complete for PDF: {len(full_ocr_text.split())} words extracted."
371
- )
372
-
373
- return (full_ocr_text, 200) if full_ocr_text else ("No text found", 415)
374
-
375
- except Exception as e:
376
- logger.exception(f"Error during PDF processing: {filepath}")
377
- return "Failed to extract text"
378
-
379
-
380
- # Helper Function: Extract Text from DOCX
381
- @log_execution_time()
382
- def extract_text_from_docx(filepath):
383
- try:
384
- doc = Document(filepath)
385
- text = "\n".join([para.text for para in doc.paragraphs])
386
- word_count = len(text.split())
387
- logger.info(f"DOCX extracted from '{filepath}': {word_count} words.")
388
- return text.strip() or None
389
- except Exception as e:
390
- logger.exception(f"Failed to extract text from DOCX: {filepath}")
391
- return None
392
-
393
-
394
- # Masking function to hide sensitive data
395
- def mask_sensitive_info(text):
396
- text = re.sub(r"(?<=\b\w{2})\w+(?=\s\w{2,})", "*", text) # Mask names
397
- text = re.sub(
398
- r"\b(\d{2})\d{2}-(\d{2})\d{2}-(\d{2})\d{2}\b", r"\2-\3-", text
399
- ) # Mask DOB
400
- text = re.sub(r"\b(\d{8})(\d{2})\b", r"\2", text) # Mask phone numbers
401
- return text
402
-
403
-
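An illustrative call for the masking heuristics above. The exact output depends on the surrounding text, since the three regex substitutions interact, so the sample string and expectations here are illustrative only:

```python
sample = "John Doe, DOB 1985-0615-1990, phone 9876543210"
print(mask_sensitive_info(sample))
# Name words lose everything after their first two letters ("Jo*"),
# and matching digit groups collapse to partial fragments.
```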
404
- # ------------------Upload Documents ------------------ #
405
- # API Route: Upload File & Extract Text
406
- @app.route("/upload", methods=["POST"])
407
- @log_execution_time()
408
- def upload_file():
409
- logger.info("📥 Upload request received")
410
- files = request.files.getlist("file")
411
- patient_name = request.form.get("patient_name", "").strip()
412
- password = request.form.get("password") # Get password if provided
413
- # Dynamic model info from form
414
- qa_model_name = request.form.get("qa_model_name")
415
- qa_model_type = request.form.get("qa_model_type")
416
-
417
- ner_model_name = request.form.get("ner_model_name")
418
- ner_model_type = request.form.get("ner_model_type")
419
-
420
- summarizer_model_name = request.form.get("summarizer_model_name")
421
- summarizer_model_type = request.form.get("summarizer_model_type")
422
-
423
- if not files:
424
- logger.warning("No file uploaded")
425
- return jsonify({"error": "No file uploaded"}), 400
426
-
427
- # 🔌 Load models dynamically
428
- try:
429
- qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
430
- logger.info(f"✅ QA model loaded: {qa_model_name}")
431
- except Exception as e:
432
- logger.error(f"❌ QA model load failed: {e}")
433
- return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
434
-
435
- try:
436
- ner_pipeline = pipeline(task=ner_model_type, model=ner_model_name)
437
- logger.info(f"✅ NER model loaded: {ner_model_name}")
438
- except Exception as e:
439
- logger.error(f"❌ NER model load failed: {e}")
440
- return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
441
-
442
- try:
443
- summarizer_pipeline = pipeline(
444
- task=summarizer_model_type, model=summarizer_model_name
445
- )
446
- logger.info(f"✅ Summarizer model loaded: {summarizer_model_name}")
447
- except Exception as e:
448
- logger.error(f"❌ Summarizer model load failed: {e}")
449
- return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
450
-
451
- extracted_data = []
452
- logger.info(f"Patient name received: {patient_name}")
453
-
454
- for file in files:
455
- logger.info(f"📂 Processing file: {file.filename}")
456
-
457
- if file.filename == "":
458
- logger.warning("Skipping unnamed file")
459
- continue # Skip empty file names
460
-
461
- if not allowed_file(file.filename):
462
- logger.warning(f"Unsupported file type: {file.filename}")
463
- return (
464
- jsonify(
465
- {
466
- "error": f"Unsupported file type: {file.filename}. Supported file types are: {', '.join(ALLOWED_EXTENSIONS)}"
467
- }
468
- ),
469
- 400,
470
- )
471
-
472
- if not patient_name:
473
- logger.warning("Patient name missing")
474
- return jsonify({"error": "Patient name is missing"}), 400
475
-
476
- # *Check file size*
477
- valid_size, error_message = check_file_size(file)
478
- if not valid_size:
479
- logger.warning(f"❌ File size validation failed: {error_message}")
480
- return jsonify({"error": error_message}), 400
481
-
482
- filename = secure_filename(file.filename)
483
- filepath = os.path.join(UPLOAD_FOLDER, filename)
484
- file.save(filepath)
485
- logger.info(f"✅ File saved: {filepath}")
486
-
487
- extracted_text = None
488
-
489
- # ✅ *Extract text based on file type*
490
- if filename.endswith(".pdf"):
491
- logger.info("🧾 Extracting text from PDF")
492
- result = extract_text_from_pdf(filepath, password)
493
-
494
- # ✅ If PDF requires a password, return 401
495
- if isinstance(result, tuple):
496
- extracted_text, status_code = result
497
- else:
498
- extracted_text = result
499
- status_code = 200
500
-
501
- if isinstance(extracted_text, dict) and "error" in extracted_text:
502
- logger.warning(f"⚠️ PDF extraction error: {extracted_text}")
503
- return jsonify(extracted_text), status_code
504
- elif filename.endswith(".docx"):
505
- extracted_text = extract_text_from_docx(filepath)
506
- elif filename.endswith((".jpg", ".jpeg", ".png", ".svg")):
507
- logger.info("🖼️ Extracting text from image")
508
- extracted_text = extract_text_from_image(filepath)
509
-
510
- if not extracted_text or extracted_text == "No text found":
511
- logger.warning(f"⚠️ No text extracted from {filename}")
512
- return (
513
- jsonify({"error": f"Failed to extract text from {filename}"}),
514
- 415,
515
- ) # Unsupported Media Type
516
-
517
- # reject blurred images
518
- if extracted_text in [
519
- "Image is too blurry, OCR failed.",
520
- "OCR failed to extract meaningful text.",
521
- ]:
522
- logger.warning(f"🔍 OCR failed or image too blurry: {filename}")
523
- return (
524
- jsonify(
525
- {"error": f"'{filename}' is too blurry or text is unreadable."}
526
- ),
527
- 422,
528
- ) # Unprocessable Entity
529
-
530
- # ✅ Medical Validation using NER
531
- skip_medical_check = (
532
- request.form.get("skip_medical_check", "false").lower() == "true"
533
- )
534
- if not skip_medical_check:
535
- logger.info("🧠 Running NER medical validation")
536
- start_time = time.time()
537
- ner_results = ner_pipeline(extracted_text)
538
- medical_entities = list(
539
- set(
540
- [
541
- r["word"]
542
- for r in ner_results
543
- if r["entity"].startswith("B-") or r["entity"].startswith("I-")
544
- ]
545
- )
546
- )
547
- elapsed_time = time.time() - start_time
548
- logger.info(f"⏱️ Medical entity validation took {elapsed_time:.2f}s")
549
-
550
- logger.info(f"🩺 Medical entities found: {medical_entities}")
551
- if not medical_entities:
552
- logger.warning(f"❌ No medical relevance in {filename}")
553
- return (
554
- jsonify({"error": f"'{filename}' is not medically relevant"}),
555
- 406,
556
- )
557
- else:
558
- logger.info(f"⏭️ Skipping medical validation for {filename}")
559
-
560
- # # ✅ Patient Name Validation using QA
561
- # skip_patient_check = request.form.get("skip_patient_check", "false").lower() == "true"
562
- # if not skip_patient_check:
563
- # try:
564
- # logger.info("🧍 Validating patient name")
565
- # start_time = time.time()
566
- # error_response = validate_patient_name(extracted_text, patient_name, filename,qa_pipeline)
567
- # elapsed_time = time.time() - start_time
568
- # logger.info(f"⏱️ Patient name validation took {elapsed_time:.2f}s")
569
-
570
- # if error_response:
571
- # return error_response
572
- # except Exception as e:
573
- # logger.error(f"❌ Patient name validation failed: {e}")
574
- # return jsonify({"error": f"Patient name validation failed: {str(e)}"}), 500
575
- # else:
576
- # logger.info(f"⏭️ Skipping patient name validation for {filename}")
577
-
578
- # ✨ Generate Summary using Summarizer
579
- try:
580
- logger.info("📝 Generating summary: %s", extracted_text)
581
-
582
- start_time = time.time()
583
- summary = summarizer_pipeline(
584
- extracted_text, max_length=350, min_length=50, do_sample=False
585
- )[0]["summary_text"]
586
- elapsed_time = time.time() - start_time
587
-
588
- logger.info(f"✅ Summary generated: {summary}")
589
- logger.info(f"⏱️ Summary generation took {elapsed_time:.2f} seconds")
590
- except Exception as e:
591
- summary = "Summary failed"
592
- logger.warning(f"⚠ Summary generation failed: {e}")
593
- # # Classify report type
594
- # report_type = classify_medical_document(extracted_text)
595
- # print(report_type)
596
- # ✅ Summarize extracted text
597
- extracted_data.append(
598
- {
599
- "file": filename,
600
- # "document_type": report_type,
601
- "extracted_text": extracted_text,
602
- "summary": summary,
603
- "message": "Successful",
604
- }
605
- )
606
- logger.info(f"✅ Finished processing file: {filename}")
607
-
608
- if not extracted_data:
609
- logger.warning("❌ No valid medical files processed")
610
- return jsonify({"error": "No valid medical files processed"}), 400
611
-
612
- logger.info("📦 Upload processing completed successfully")
613
- return jsonify({"extracted_data": extracted_data}), 200
614
-
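A hedged client sketch for the `/upload` route. The host, file name, and model choices below are assumptions for illustration, not values taken from this repository; any Hub models compatible with the three pipeline tasks should work:

```python
import requests

resp = requests.post(
    "http://localhost:5000/upload",
    files=[("file", open("lab_report.pdf", "rb"))],  # hypothetical document
    data={
        "patient_name": "John Doe",
        "qa_model_name": "distilbert-base-cased-distilled-squad",
        "qa_model_type": "question-answering",
        "ner_model_name": "d4data/biomedical-ner-all",
        "ner_model_type": "ner",
        "summarizer_model_name": "facebook/bart-large-cnn",
        "summarizer_model_type": "summarization",
    },
)
print(resp.status_code, resp.json())  # {"extracted_data": [...]} on success
```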
615
-
616
- # # API Route: Extract Medical Data Based on Predefined Questions
617
- # @app.route('/extract_medical_data', methods=['POST'])
618
- # def extract_medical_data():
619
- # data = request.json
620
- # print(f"📥 Incoming request data: {data}")
621
-
622
- # qa_model_name = data.get("qa_model_name")
623
- # qa_model_type = data.get("qa_model_type")
624
-
625
- # if "extracted_data" not in data:
626
- # return jsonify({"error": "Missing 'extracted_data' in request"}), 400
627
-
628
- # if not qa_model_name or not qa_model_type:
629
- # return jsonify({"error": "Missing 'model_name' or 'model_type'"}), 400
630
-
631
- # try:
632
- # print(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
633
- # qa_pipeline = pipeline(task=qa_model_type, model=qa_model_name)
634
- # print(f"✅ Model loaded: {qa_pipeline.model.config._name_or_path}")
635
- # except Exception as e:
636
- # print("❌ Error loading model:", str(e))
637
- # return jsonify({"error": f"Could not load model: {str(e)}"}), 500
638
-
639
- # questions = {
640
- # "Patient Name": "What is the patient's name?",
641
- # "Age": "What is the patient's age?",
642
- # "Gender": "What is the patient's gender?",
643
- # "Date of Birth": "What is the patient's date of birth?",
644
- # "Patient ID": "What is the patient ID?",
645
- # "Reason for Visit": "What is the reason for the patient's visit?",
646
- # "Physician": "Who is the physician in charge of the patient?",
647
- # "Test Date": "What is the test date?",
648
- # "Hemoglobin": "What is the patient's hemoglobin level?",
649
- # "Blood Glucose (Fasting)": "What is the patient's fasting blood glucose level?",
650
- # "Total Cholesterol": "What is the total cholesterol level?",
651
- # "LDL Cholesterol": "What is the LDL cholesterol level?",
652
- # "HDL Cholesterol": "What is the HDL cholesterol level?",
653
- # "Serum Creatinine": "What is the serum creatinine level?",
654
- # "Vitamin D (25-OH)": "What is the patient's Vitamin D level?",
655
- # "Height": "What is the patient's height?",
656
- # "Weight": "What is the patient's weight?",
657
- # "Blood Pressure (Systolic)": "What is the patient's systolic blood pressure?",
658
- # "Blood Pressure (Diastolic)": "What is the patient's diastolic blood pressure?",
659
- # "Recommendations": "What are the recommendations based on the test results?"
660
- # }
661
-
662
- # structured_response = {"extracted_data": []}
663
-
664
- # for file_data in data["extracted_data"]:
665
- # filename = file_data["file"]
666
- # context = file_data["extracted_text"]
667
-
668
- # if not context:
669
- # structured_response["extracted_data"].append({
670
- # "file": filename,
671
- # "medical_terms": "No data extracted"
672
- # })
673
- # continue
674
-
675
- # # Prepare batch QA input
676
- # qa_inputs = [
677
- # {"question": q, "context": context}
678
- # for q in questions.values()
679
- # ]
680
-
681
- # try:
682
- # qa_outputs = qa_pipeline(qa_inputs)
683
- # print("📤 Batch QA outputs:", qa_outputs)
684
- # except Exception as e:
685
- # print("⚠️ Batch failed, falling back to loop:", str(e))
686
- # qa_outputs = [qa_pipeline(q) for q in qa_inputs]
687
-
688
- # # Map answers back to questions
689
- # extracted_info = {}
690
- # for i, key in enumerate(questions.keys()):
691
- # answer = qa_outputs[i].get("answer", "").strip()
692
- # score = qa_outputs[i].get("score", 0.0)
693
-
694
- # # If the model returns an empty string or very low confidence, mark as "Not Mentioned"
695
- # if not answer or score < 0.1:
696
- # extracted_info[key] = "Not Mentioned"
697
- # else:
698
- # extracted_info[key] = answer
699
-
700
- # # Optional: Clean results
701
- # # extracted_info = {k: clean_result(v) for k, v in extracted_info.items()}
702
-
703
- # categorized_data = [
704
- # {
705
- # "name": "Patient Information",
706
- # "fields": [
707
- # {"label": "Patient Name", "value": extracted_info.get("Patient Name", "")},
708
- # {"label": "Date of Birth", "value": extracted_info.get("Date of Birth", "")},
709
- # {"label": "Gender", "value": extracted_info.get("Gender", "")},
710
- # {"label": "Patient ID", "value": extracted_info.get("Patient ID", "")}
711
- # ]
712
- # },
713
- # {
714
- # "name": "Vitals",
715
- # "fields": [
716
- # {"label": "Height", "value": extracted_info.get("Height", "")},
717
- # {"label": "Weight", "value": extracted_info.get("Weight", "")},
718
- # {"label": "Blood Pressure", "value": f"{extracted_info.get('Blood Pressure (Systolic)', '')}/{extracted_info.get('Blood Pressure (Diastolic)', '')} mmHg"},
719
- # {"label": "Hemoglobin", "value": extracted_info.get("Hemoglobin", "")},
720
- # {"label": "Serum Creatinine", "value": extracted_info.get("Serum Creatinine", "")}
721
- # ]
722
- # },
723
- # {
724
- # "name": "Lab Results",
725
- # "fields": [
726
- # {"label": "Blood Glucose (Fasting)", "value": extracted_info.get("Blood Glucose (Fasting)", "")},
727
- # {"label": "Total Cholesterol", "value": extracted_info.get("Total Cholesterol", "")},
728
- # {"label": "LDL Cholesterol", "value": extracted_info.get("LDL Cholesterol", "")},
729
- # {"label": "HDL Cholesterol", "value": extracted_info.get("HDL Cholesterol", "")},
730
- # {"label": "Vitamin D (25-OH)", "value": extracted_info.get("Vitamin D (25-OH)", "")}
731
- # ]
732
- # },
733
- # {
734
- # "name": "Medical Notes",
735
- # "fields": [
736
- # {"label": "Reason for Visit", "value": extracted_info.get("Reason for Visit", "")},
737
- # {"label": "Physician", "value": extracted_info.get("Physician", "")},
738
- # {"label": "Test Date", "value": extracted_info.get("Test Date", "")},
739
- # {"label": "Recommendations", "value": extracted_info.get("Recommendations", "")}
740
- # ]
741
- # }
742
- # ]
743
-
744
- # structured_response["extracted_data"].append({
745
- # "file": filename,
746
- # "medical_terms": extracted_info,
747
- # "categorized_data": categorized_data
748
- # })
749
-
750
- # save_data_to_storage(filename, structured_response)
751
- # print(f"✅ Extracted data saved to: {os.path.join(UPLOAD_FOLDER, f'{filename}.json')}")
752
-
753
- # return jsonify(structured_response)
754
-
755
-
756
- # ------------------ CLEAN FUNCTION ------------------ #
757
- @log_execution_time()
758
- def clean_result(value):
759
- logger.debug("Cleaning value: %s", value)
760
- if isinstance(value, str):
761
- value = re.sub(r"\s+", " ", value)
762
- value = re.sub(r"[-_:]+", " ", value)
763
- value = re.sub(r"[^\x00-\x7F]+", " ", value)
764
- value = re.sub(
765
- r"(?<=\d),(?=\d)", "", value
766
- ) # Remove commas in numbers like 250,000
767
- return value.strip() if value.strip() else "Not Available"
768
- elif isinstance(value, list):
769
- cleaned = [clean_result(v) for v in value if v is not None]
770
- return cleaned if cleaned else ["Not Available"]
771
- elif isinstance(value, dict):
772
- return {k: clean_result(v) for k, v in value.items()}
773
- return value
774
-
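Illustrative calls for `clean_result`; outputs are described loosely because the substitutions interact:

```python
print(clean_result("Hb:  13.5  g/dL"))    # runs of whitespace and separator chars normalized
print(clean_result("250,000"))             # comma inside the number removed -> "250000"
print(clean_result(["", None, "stable"]))  # None dropped; empty strings become "Not Available"
```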
775
- # ------------------Group by Category ------------------ #
776
- @log_execution_time()
777
- def group_by_category(data):
778
- logger.info("Grouping extracted items by category")
779
- grouped = defaultdict(list)
780
- category_times = {}
781
-
782
- for item in data:
783
- cat = item.get("category", "General")
784
- start_time = time.time()
785
- grouped[cat].append(
786
- {
787
- "question": item.get("question", "Not Created"),
788
- "label": item.get("label", "Unknown"),
789
- "answer": item.get("answer", "Not Available"),
790
- }
791
- )
792
- elapsed = time.time() - start_time
793
- category_times[cat] = category_times.get(cat, 0) + elapsed
794
-
795
- for cat, details in grouped.items():
796
- logger.info(f"📂 Category '{cat}': {len(details)} items, time taken: {category_times[cat]:.4f}s")
797
-
798
- return [{"category": k, "detail": v} for k, v in grouped.items()]
799
-
800
-
801
- # ------------------detect duplicate to remove it ------------------ #
802
- @log_execution_time()
803
- def deduplicate_extractions(data):
804
- logger.info("Deduplicating extracted data")
805
- seen = set()
806
- unique = []
807
- for item in data:
808
- # Deduplicate on the 'label' field; extend to a tuple of fields if stricter matching is needed
809
- key = item.get("label")
810
- if key not in seen:
811
- seen.add(key)
812
- unique.append(item)
813
- return unique
814
-
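A short demo of `deduplicate_extractions` and `group_by_category` together, on the object shape the extraction prompt produces:

```python
items = [
    {"label": "heart rate", "question": "What is the heart rate?", "answer": "78 bpm", "category": "Vitals"},
    {"label": "heart rate", "question": "What is the heart rate?", "answer": "78 bpm", "category": "Vitals"},
    {"label": "diagnosis", "question": "What is the diagnosis?", "answer": "Hypertension", "category": "Diagnosis"},
]
unique = deduplicate_extractions(items)  # second "heart rate" entry is dropped
grouped = group_by_category(unique)
# -> [{"category": "Vitals", "detail": [...]}, {"category": "Diagnosis", "detail": [...]}]
```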
815
-
816
- # Load tokenizer outside the route for performance
817
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
818
-
819
-
820
- # -----------------------------Split text into overlapping chunks---------------#
821
- @log_execution_time()
822
- def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
823
- """
824
- Splits text into overlapping token-based chunks without using NLTK.
825
-
826
- Args:
827
- text (str): Raw input text.
828
- tokenizer (transformers tokenizer): Hugging Face tokenizer instance.
829
- max_tokens (int): Max tokens per chunk.
830
- overlap (int): Number of overlapping tokens between chunks.
831
-
832
- Returns:
833
- List[str]: List of decoded text chunks.
834
- """
835
- # Tokenize the full text
836
- logger.info("Splitting text into chunks")
837
- input_ids = tokenizer.encode(text, add_special_tokens=False)
838
- chunks = []
839
- start = 0
840
- while start < len(input_ids):
841
- end = start + max_tokens
842
- chunk_ids = input_ids[start:end]
843
- chunk_str = tokenizer.decode(chunk_ids, skip_special_tokens=True)
844
- # Ensure partial continuation isn't cut off mid-sentence
845
- if not chunk_str.endswith(('.', '?', '!', ':')):
846
- chunk_str += "..."
847
-
848
- chunks.append(chunk_str)
849
- start += max_tokens - overlap
850
- logger.info("Created %d chunks", len(chunks))
851
- return chunks
852
-
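With the defaults `max_tokens=512` and `overlap=50`, the window advances by 512 - 50 = 462 tokens per step, so consecutive chunks share 50 tokens of context. A hypothetical 1,000-token document therefore yields three chunks covering tokens [0, 512), [462, 974), and [924, 1000):

```python
chunks = chunk_text(long_report_text, tokenizer)  # long_report_text is hypothetical
print(len(chunks))  # 3 for a 1,000-token input with the default settings
```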
853
- # ------------------ PARSE JSON OBJECTS FROM OUTPUT ------------------ #
854
- @log_execution_time()
855
- def extract_json_objects(text):
856
- logger.info("Extracting JSON objects from text")
857
- extracted = []
858
- try:
859
- json_start = text.index('[')
860
- json_text = text[json_start:]
861
- except ValueError:
862
- logger.warning("⚠ '[' not found in output")
863
- return []
864
-
865
- # Try parsing full array first
866
- try:
867
- parsed = json.loads(json_text)
868
- if isinstance(parsed, list):
869
- return parsed
870
- except Exception:
871
- pass # fallback to manual parsing
872
-
873
- # Manual recovery via brace matching
874
- stack = 0
875
- obj_start = None
876
- for i, char in enumerate(json_text):
877
- if char == '{':
878
- if stack == 0:
879
- obj_start = i
880
- stack += 1
881
- elif char == '}':
882
- stack -= 1
883
- if stack == 0 and obj_start is not None:
884
- obj_str = json_text[obj_start:i+1]
885
- try:
886
- obj = json.loads(obj_str)
887
- extracted.append(obj)
888
- except Exception as e:
889
- logger.error(f"❌ Invalid JSON object: {e}")
890
- obj_start = None
891
-
892
- return extracted
893
-
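`extract_json_objects` tolerates chatter before the array and damage inside it: if `json.loads` fails on the whole array, the brace-matching fallback still recovers every complete object. For example:

```python
noisy = 'Sure! Here is the data: [{"label": "bp", "answer": "120/80"},, {"label": "hr"'
print(extract_json_objects(noisy))
# -> [{'label': 'bp', 'answer': '120/80'}]  (the truncated second object is skipped)
```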
894
-
895
- # ------------------ PROCESS A SINGLE CHUNK ------------------ #
896
- @log_execution_time()
897
- def process_chunk(generator, chunk, idx):
898
- logger.info("Processing chunk %d", idx + 1)
899
- prompt = f"""
900
- [INST] <<SYS>>
901
- You are a clinical data extraction assistant.
902
-
903
- Your job is to:
904
- 1. Read the following medical report.
905
- 2. Extract all medically relevant facts as a list of JSON objects.
906
- 3. Each object must include:
907
- - "label": a short field name (e.g., "blood pressure", "diagnosis")
908
- - "question": a question related to that field
909
- - "answer": the answer from the text
910
- 4. After extracting the list, categorize each object under one of the following fixed categories:
911
-
912
- - Patient Info
913
- - Vitals
914
- - Symptoms
915
- - Allergies
916
- - Habits
917
- - Comorbidities
918
- - Diagnosis
919
- - Medication
920
- - Laboratory
921
- - Radiology
922
- - Doctor Note
923
-
924
- Example format for structure only — do not include in output:
925
- [
926
- {{
927
- "label": "patient name",
928
- "question": "What is the patient's name?",
929
- "answer": "John Doe",
930
- "category": "Patient Info"
931
- }},
932
- {{
933
- "label": "heart rate",
934
- "question": "What is the heart rate?",
935
- "answer": "78 bpm",
936
- "category": "Vitals"
937
- }}
938
- ]
939
-
940
- ⚠ Use the categories listed above. If an item does not fit any of them, create a new category for it.
941
-
942
- Text:
943
- {chunk}
944
-
945
- Return a single valid JSON array of all extracted objects.
946
- Do not include any explanations or commentary.
947
- Only output the JSON array.
948
- <</SYS>> [/INST]
949
- """
950
-
951
- try:
952
- output = generator(
953
- prompt,
954
- max_new_tokens=1024,
955
- do_sample=True,
956
- temperature=0.3
957
- )[0]["generated_text"]
958
- print("----------------------------------")
959
- logger.info(f"📤 Output from chunk {idx}: {output}...")
960
- return idx, output
961
- except Exception as e:
962
- logger.error("Error processing chunk %d: %s", idx, e)
963
- return idx, None
964
-
965
-
966
- # ------------------Extract Medical Data ------------------ #
967
- @app.route("/extract_medical_data", methods=["POST"])
968
- @log_execution_time()
969
- def extract_medical_data():
970
- data = request.json
971
- logger.info("Received request: %s", json.dumps(data, indent=2))
972
-
973
- qa_model_name = data.get("qa_model_name")
974
- qa_model_type = data.get("qa_model_type")
975
- extracted_files = data.get("extracted_data")
976
-
977
- if not qa_model_name or not qa_model_type:
978
- return jsonify({"error": "Missing 'qa_model_name' or 'qa_model_type'"}), 400
979
-
980
- if not extracted_files:
981
- return jsonify({"error": "Missing 'extracted_data' in request"}), 400
982
-
983
- try:
984
- logger.info(f"🌀 Loading model: {qa_model_name} ({qa_model_type})")
985
- model = AutoModelForCausalLM.from_pretrained(qa_model_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
986
- generator = pipeline(task=qa_model_type, model=model, tokenizer=tokenizer)  # note: reuses the module-level Qwen3-4B tokenizer, so qa_model_name should share that vocabulary
987
- logger.info(f"✅ Model loaded successfully: {generator.model.config._name_or_path}")
988
- except Exception as e:
989
- logger.error("❌ Model load failure")
990
- return jsonify({"error": f"Could not load model: {str(e)}"}), 500
991
-
992
- structured_response = {"extracted_data": []}
993
-
994
- for file_data in extracted_files:
995
- filename = file_data.get("file", "unknown_file")
996
- context = file_data.get("extracted_text", "").strip()
997
- logger.info("Processing file: %s", filename)
998
-
999
- if not context:
1000
- logger.warning("No text found in file: %s", filename)
1001
- structured_response["extracted_data"].append(
1002
- {"file": filename, "medical_fields": "No data extracted"}
1003
- )
1004
- continue
1005
-
1006
- chunks = chunk_text(context, tokenizer)
1007
- logger.info(f"📚 Chunked into {len(chunks)} parts for {filename}")
1008
-
1009
- all_extracted = []
1010
- # for idx,chunk in enumerate(chunks):
1011
- # print(f"Processing chunk {idx+1}/{len(chunks)}")
1012
-
1013
- with ThreadPoolExecutor(max_workers=4) as executor:
1014
- futures = {
1015
- executor.submit(process_chunk, generator, chunk, idx): idx
1016
- for idx, chunk in enumerate(chunks)
1017
- }
1018
- for future in as_completed(futures):
1019
- idx = futures[future]
1020
- _, output = future.result()
1021
-
1022
- if not output:
1023
- continue
1024
-
1025
- try:
1026
- objs = extract_json_objects(output)
1027
- if objs:
1028
- all_extracted.extend(objs)
1029
- else:
1030
- logger.error(f"⚠ Chunk {idx+1} yielded no valid JSON.")
1031
- except Exception as e:
1032
- logger.error(f"❌ Error extracting JSON from chunk {idx+1}")
1033
-
1034
- # Clean and group results for this file
1035
- if all_extracted:
1036
- deduped = deduplicate_extractions(all_extracted)
1037
- # cleaned_json = clean_result()
1038
- grouped_data = group_by_category(deduped)
1039
- else:
1040
- grouped_data = {"error": "No valid data extracted"}
1041
-
1042
- structured_response["extracted_data"].append(
1043
- {"file": filename, "medical_fields": grouped_data}
1044
- )
1045
-
1046
- try:
1047
- save_data_to_storage(filename, grouped_data)
1048
- except Exception as e:
1049
- logger.error(f"⚠ Failed to save data for {filename}: {e}")
1050
-
1051
- logger.info("✅ Extraction complete.")
1052
- return jsonify(structured_response)
1053
-
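A hedged client sketch for `/extract_medical_data`; it replays the output of `/upload`. The host is an assumption, and the model name is chosen to match the module-level Qwen3-4B tokenizer loaded above:

```python
import requests

payload = {
    "qa_model_name": "Qwen/Qwen3-4B",
    "qa_model_type": "text-generation",
    "extracted_data": [
        {"file": "lab_report.pdf",  # hypothetical file from a prior /upload call
         "extracted_text": "BP 120/80. Diagnosis: hypertension."}
    ],
}
resp = requests.post("http://localhost:5000/extract_medical_data", json=payload)
print(resp.json())  # {"extracted_data": [{"file": ..., "medical_fields": [...]}]}
```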
1054
-
1055
-
1056
- # -------------------------- save data to a JSON file----------------------#
1057
- @log_execution_time()
1058
- def save_data_to_storage(filename, data):
1059
- try:
1060
- filename = filename.rsplit(".", 1)[0] # Remove extension
1061
- filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
1062
- logger.info(f"💾 Saving to: {filepath}")
1063
- with open(filepath, "w") as file:
1064
- json.dump(data, file)
1065
- logger.info(f"✅ Data saved successfully to {filepath}")
1066
- except Exception as e:
1067
- logger.error(f"🚨 Exception during save: {e}")
1068
-
1069
-
1070
- # Function to get data from a JSON file
1071
- # 🔍 Get data from storage
1072
- @log_execution_time()
1073
- def get_data_from_storage(filename):
1074
- try:
1075
- filepath = os.path.join(UPLOAD_FOLDER, f"{filename}.json")
1076
- logger.info(f"🔍 Looking for file at: {filepath}")
1077
- if not os.path.exists(filepath):
1078
- logger.warning(f"🚫 File not found at: {filepath}")
1079
- return None
1080
- with open(filepath, "r") as file:
1081
- data = json.load(file)
1082
- logger.info(f"✅ File found and loaded: {filepath}")
1083
- return data
1084
- except Exception as e:
1085
- logger.error(f"🚨 Error loading data: {e}")
1086
- return None
1087
-
1088
-
1089
- # 🔹 Fetch updated medical data
1090
- @app.route("/get_updated_medical_data", methods=["GET"])
1091
- @log_execution_time()
1092
- def get_updated_data():
1093
- file_name = request.args.get("file")
1094
-
1095
- if not file_name:
1096
- return jsonify({"error": "File name is required"}), 400
1097
-
1098
- # 🔥 Strip extension if present
1099
- file_name = file_name.rsplit(".", 1)[0]
1100
-
1101
- # ✅ Load updated JSON data from storage
1102
- updated_data = get_data_from_storage(file_name)
1103
-
1104
- if updated_data:
1105
- return jsonify({"file": file_name, "data": updated_data}), 200
1106
- else:
1107
- return jsonify({"error": f"File '{file_name}' not found"}), 404
1108
-
1109
-
1110
-
1111
- @app.route("/update_medical_data", methods=["PUT"])
1112
- @log_execution_time()
1113
- def update_medical_data():
1114
- try:
1115
- data = request.json
1116
- logger.info("Received update: %s", json.dumps(data, indent=2))
1117
-
1118
- filename = data.get("file", "").rsplit(".", 1)[0] # Strip extension like .pdf
1119
- updates = data.get("updates", [])
1120
-
1121
- if not filename or not updates:
1122
- return jsonify({"error": "File name or updates missing"}), 400
1123
-
1124
- # Load current stored data
1125
- existing_data = get_data_from_storage(filename)
1126
- if not existing_data:
1127
- return jsonify({"error": f"File '{filename}' not found"}), 404
1128
-
1129
- # Loop through updates and modify categorized_data
1130
- for update in updates:
1131
- category = update.get("category")
1132
- field = update.get("field")
1133
- new_value = update.get("value")
1134
- updated = False
1135
-
1136
- for extracted in existing_data.get("extracted_data", []):
1137
- for cat in extracted.get("categorized_data", []):
1138
- if cat.get("name") == category:
1139
- for fld in cat.get("fields", []):
1140
- if fld.get("label") == field:
1141
- logger.info("Updating [%s] %s → %s", category, field, new_value)
1142
- fld["value"] = new_value
1143
- updated = True
1144
- break
1145
- if updated:
1146
- break
1147
- if updated:
1148
- break
1149
-
1150
- # 🧠 Sync medical_terms with categorized_data
1151
- for extracted in existing_data.get("extracted_data", []):
1152
- if "categorized_data" in extracted:
1153
- new_terms = {}
1154
- for category in extracted["categorized_data"]:
1155
- for field in category.get("fields", []):
1156
- label = field.get("label")
1157
- value = field.get("value", "")
1158
- new_terms[label] = value
1159
- extracted["medical_terms"] = new_terms
1160
- logger.info("Synced 'medical_terms' with 'categorized_data'")
1161
-
1162
- # Save updated data to file
1163
- save_data_to_storage(filename, existing_data)
1164
- logger.info("✅ Updated data saved successfully")
1165
-
1166
- return (
1167
- jsonify(
1168
- {"message": "Data updated successfully", "updated_data": existing_data}
1169
- ),
1170
- 200,
1171
- )
1172
-
1173
- except Exception as e:
1174
- logger.error("Update error: %s", e)
1175
- return jsonify({"error": str(e)}), 500
1176
-
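A hedged request sketch for `/update_medical_data`; the category and field names mirror the `categorized_data` layout shown in the commented-out route above and are assumptions about the stored JSON:

```python
import requests

resp = requests.put(
    "http://localhost:5000/update_medical_data",
    json={
        "file": "lab_report.pdf",  # extension is stripped server-side
        "updates": [
            {"category": "Vitals", "field": "Hemoglobin", "value": "13.5 g/dL"}
        ],
    },
)
print(resp.status_code)  # 200 with the updated document on success
```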
1177
- # Test Route
1178
- @app.route("/")
1179
- def home():
1180
- return "Medical Data Extraction API is running!"
1181
-
1182
-
1183
- if __name__ == "__main__":
1184
- app.run(host="0.0.0.0", port=5000, debug=True)
1185
- # if __name__ == '__main__':
1186
- # from gevent.pywsgi import WSGIServer # type: ignore
1187
- # http_server = WSGIServer(('0.0.0.0', 5000), app)
1188
- # http_server.serve_forever()
speech_to_chart.py DELETED
@@ -1,638 +0,0 @@
1
-
2
-
3
- import json
4
- import os
5
- import re
6
- import logging
7
- import shutil
8
- from flask import Flask, request, jsonify, abort
9
- from werkzeug.utils import secure_filename
10
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
11
- import torch
12
- import whisper
13
- from dotenv import load_dotenv
14
- import pytesseract
15
- import cv2
16
- import pdfplumber
17
- import pandas as pd
18
- from PIL import Image
19
- from docx import Document
20
- from flask_cors import CORS
21
-
22
- # Load environment variables
23
- load_dotenv()
24
-
25
- # Initialize Flask app
26
- app = Flask(__name__)
27
- CORS(app)
28
-
29
- # Configure logging
30
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
31
-
32
- # Configure upload directory and max file size
33
- UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
34
- os.makedirs(UPLOAD_DIR, exist_ok=True)
35
- app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
36
- app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16 MB max file size
37
-
38
- # Allowed file extensions
39
- ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
40
- ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
41
-
42
- # Ensure ffmpeg is in PATH
43
- ffmpeg_path = shutil.which("ffmpeg") or "C:\\ffmpeg\\bin\\ffmpeg.exe"
44
- if not os.path.exists(ffmpeg_path):
45
- raise RuntimeError("FFmpeg not found! Please install FFmpeg and set the correct path.")
46
- os.environ["PATH"] += os.pathsep + os.path.dirname(ffmpeg_path)\
47
-
48
- def allowed_file(filename, allowed_extensions):
49
- """Check if the file extension is allowed."""
50
- return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
51
-
52
- class LazyModelLoader:
53
- def __init__(self, model_name, task, tokenizer=None, apply_quantization=False):
54
- self.model_name = model_name
55
- self.task = task
56
- self.tokenizer = tokenizer
57
- self.apply_quantization = apply_quantization
58
- self._pipeline = None
59
-
60
- def load(self):
61
- if self._pipeline is None:
62
- logging.info(f"Loading pipeline for task: {self.task} | model: {self.model_name}")
63
- if self.task == "question-answering":
64
- model = AutoModelForCausalLM.from_pretrained(self.model_name)  # caution: a causal LM cannot back a "question-answering" pipeline; this branch is unused by the loaders below
65
- tokenizer = AutoTokenizer.from_pretrained(self.model_name)
66
- if self.apply_quantization:
67
- logging.info("Applying quantization...")
68
- model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
69
- self._pipeline = pipeline(self.task, model=model, tokenizer=tokenizer)
70
- else:
71
- self._pipeline = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
72
- return self._pipeline
73
-
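The loader defers pipeline construction until first use, so importing this module stays cheap, and repeated `load()` calls return the cached pipeline. A minimal usage sketch, assuming a small summarization model:

```python
loader = LazyModelLoader("google-t5/t5-small", "summarization")
summarizer = loader.load()           # model downloads/loads here, not at import time
assert loader.load() is summarizer   # subsequent calls reuse the cached pipeline
```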
74
- # PHI scrubbing agent
75
- class PHIScrubberAgent:
76
- @staticmethod
77
- def scrub_phi(text):
78
- try:
79
- text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
80
- text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
81
- text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
82
- text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]',
83
- text, flags=re.IGNORECASE)
84
- text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
85
- text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
86
- except Exception as e:
87
- logging.error(f"PHI scrubbing failed: {e}")
88
- return text
89
-
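An illustrative call for the scrubber; the patterns are regex heuristics, so capitalized word pairs are masked as names even when they are not people:

```python
note = "Dr. Jane Smith saw the patient. Call 555-123-4567 or mail jane@clinic.org."
print(PHIScrubberAgent.scrub_phi(note))
# -> "Dr. [NAME] saw the patient. Call [PHONE] or mail [EMAIL]."
```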
90
- # Summarization Agent
91
- class SummarizerAgent:
92
- def __init__(self, summarization_model_loader):
93
- self.summarization_model_loader = summarization_model_loader
94
-
95
- def generate_summary(self, text):
96
- model = self.summarization_model_loader.load()
97
- try:
98
- summary_result = model(text, max_length=150, min_length=30, do_sample=False)
99
- return summary_result[0]['summary_text'].strip()
100
- except Exception as e:
101
- logging.error(f"Summary generation failed: {e}")
102
- return "Summary generation failed."
103
-
104
- # Medical Data Extraction Agent
105
- class MedicalDataExtractorAgent:
106
- def __init__(self, gen_model_loader):
107
- self.gen_model_loader = gen_model_loader
108
-
109
- def extract_medical_data(self, text):
110
- try:
111
- generator = self.gen_model_loader.load()
112
- prompt = (
113
- "Extract structured medical information from the following clinical note.\n\n"
114
- "Return the result in JSON format with the following fields:\n"
115
- "patient_condition, symptoms, current_problems, allergies, dr_notes, "
116
- "prescription, investigations, follow_up_instructions.\n\n"
117
- f"Clinical Note:\n{text}\n\n"
118
- "Structured JSON Output:\n"
119
- )
120
- response = generator(prompt, max_new_tokens=256)[0]["generated_text"]
121
- logging.debug(f"Raw model output: {response}")
122
-
123
- json_start = response.find("{")
124
- json_end = response.rfind("}") + 1
125
- if json_start == -1 or json_end == 0:  # rfind returns -1, so the +1 above yields 0 when no "}" exists
126
- raise ValueError("No JSON found in the model response.")
127
-
128
- json_str = response[json_start:json_end]
129
- return json.loads(json_str)
130
-
131
- except Exception as e:
132
- logging.error(f"Error extracting medical data: {e}")
133
- return {"error": f"Failed to extract medical data: {str(e)}"}
134
-
135
- # Initialize lazy loaders
136
- gen_model_loader = LazyModelLoader(
137
- "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
138
- "text-generation",
139
- )
140
- summarization_model_loader = LazyModelLoader("google-t5/t5-large", "summarization", apply_quantization=True)
141
- whisper_model = whisper.load_model("base")
142
-
143
- # Initialize agents
144
- phi_scrubber_agent = PHIScrubberAgent()
145
- medical_data_extractor_agent = MedicalDataExtractorAgent(gen_model_loader)
146
- summarizer_agent = SummarizerAgent(summarization_model_loader)
147
-
148
- # API Endpoints
149
- @app.route('/api/extract_medical_data', methods=['POST'])
150
- def extract_medical_data():
151
- try:
152
- data = request.json
153
- if "text" not in data or not data["text"].strip():
154
- return jsonify({"error": "No valid text provided"}), 400
155
- raw_text = data["text"]
156
- clean_text = phi_scrubber_agent.scrub_phi(raw_text)
157
- structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
158
- return jsonify(structured_data), 200
159
- except Exception as e:
160
- logging.error(f"Failed to extract medical data: {e}")
161
- return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
162
-
163
- @app.route('/api/transcribe', methods=['POST'])
164
- def transcribe_audio():
165
- if 'audio' not in request.files:
166
- abort(400, description="No audio file provided")
167
- audio_file = request.files['audio']
168
- if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
169
- abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
170
- filename = secure_filename(audio_file.filename)
171
- audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
172
- audio_file.save(audio_path)
173
- try:
174
- result = whisper_model.transcribe(audio_path)
175
- transcribed_text = result["text"]
176
- os.remove(audio_path)
177
- return jsonify({"transcribed_text": transcribed_text}), 200
178
- except Exception as e:
179
- logging.error(f"Transcription failed: {str(e)}")
180
- return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
181
-
182
- @app.route('/api/generate_summary', methods=['POST'])
183
- def generate_summary():
184
- data = request.json
185
- if "text" not in data or not data["text"].strip():
186
- return jsonify({"error": "No valid text provided"}), 400
187
- context = data["text"]
188
- clean_text = phi_scrubber_agent.scrub_phi(context)
189
- summary = summarizer_agent.generate_summary(clean_text)
190
- return jsonify({"summary": summary}), 200
191
-
192
- @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
193
- def extract_medical_data_from_audio():
194
- if 'audio' not in request.files:
195
- abort(400, description="No audio file provided")
196
- audio_file = request.files['audio']
197
- if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
198
- abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
199
- filename = secure_filename(audio_file.filename)
200
- audio_path = os.path.join(UPLOAD_DIR, filename)
201
- audio_file.save(audio_path)
202
- try:
203
- result = whisper_model.transcribe(audio_path)
204
- transcribed_text = result["text"]
205
- clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
206
- summary = summarizer_agent.generate_summary(clean_text)
207
- structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
208
- response = {
209
- "transcribed_text": clean_text,
210
- "summary": summary,
211
- "medical_chart": structured_data
212
- }
213
- os.remove(audio_path)
214
- return jsonify(response), 200
215
- except Exception as e:
216
- logging.error(f"Processing failed: {str(e)}")
217
- return jsonify({"error": f"Processing failed: {str(e)}"}), 500
218
-
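A hedged end-to-end client sketch for `/api/extract_medical_data_from_audio`: one request covers transcription, PHI scrubbing, summarization, and chart extraction. The host and audio file name are assumptions:

```python
import requests

with open("visit.wav", "rb") as f:  # hypothetical recording; mp3/wav/flac accepted
    resp = requests.post(
        "http://localhost:5000/api/extract_medical_data_from_audio",
        files={"audio": f},
    )
print(resp.json())  # {"transcribed_text": ..., "summary": ..., "medical_chart": {...}}
```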
219
- if __name__ == '__main__':
220
- app.run(host='0.0.0.0', port=5000, debug=False)
221
-
222
- # import json
223
- # import os
224
- # import re
225
- # import logging
226
- # import shutil
227
- # from dotenv import load_dotenv
228
- # from flask import Flask, request, jsonify, abort
229
- # from werkzeug.utils import secure_filename
230
- # from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
231
- # import pytesseract
232
- # import cv2
233
- # import pdfplumber
234
- # import pandas as pd
235
- # from PIL import Image
236
- # from docx import Document
237
- # from flask_cors import CORS
238
- # from flask_executor import Executor
239
- # from sentence_transformers import SentenceTransformer
240
- # import faiss
241
- # import whisper
242
- # from PyPDF2 import PdfReader
243
- # from pdf2image import convert_from_path
244
- # from concurrent.futures import ThreadPoolExecutor
245
- # import tempfile
246
-
247
- # # Load environment variables
248
- # load_dotenv()
249
-
250
- # # Initialize Flask app
251
- # app = Flask(__name__)
252
- # CORS(app)
253
-
254
- # # Configure logging
255
- # logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
256
-
257
- # # Configure upload directory and max file size
258
- # UPLOAD_DIR = os.getenv('UPLOAD_DIR', os.path.join(os.getcwd(), 'uploads'))
259
- # os.makedirs(UPLOAD_DIR, exist_ok=True)
260
- # app.config['UPLOAD_FOLDER'] = UPLOAD_DIR
261
- # app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16 MB max file size
262
-
263
- # # Initialize Flask-Executor for asynchronous tasks
264
- # executor = Executor(app)
265
- # whisper_model = whisper.load_model("tiny")
266
- # # Allowed file extensions
267
- # ALLOWED_AUDIO_EXTENSIONS = {'mp3', 'wav', 'flac'}
268
- # ALLOWED_DOCUMENT_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'docx', 'xlsx', 'xls'}
269
-
270
- # # Ensure ffmpeg is in PATH
271
- # ffmpeg_path = shutil.which("ffmpeg") or "C:\\ffmpeg\\bin\\ffmpeg.exe"
272
- # if not os.path.exists(ffmpeg_path):
273
- # raise RuntimeError("FFmpeg not found! Please install FFmpeg and set the correct path.")
274
- # os.environ["PATH"] += os.pathsep + os.path.dirname(ffmpeg_path)
275
-
276
- # # Lazy model loading to save resources
277
- # class LazyModelLoader:
278
- # def __init__(self, model_name, task, tokenizer=None):
279
- # self.model_name = model_name
280
- # self.task = task
281
- # self.tokenizer = tokenizer
282
- # self._model = None
283
-
284
- # def load(self):
285
- # """Load the model if not already loaded."""
286
- # if self._model is None:
287
- # logging.info(f"Loading model: {self.model_name}")
288
- # if self.task == "text-generation":
289
- # self._model = AutoModelForCausalLM.from_pretrained(
290
- # self.model_name, device_map="auto", torch_dtype="auto"
291
- # )
292
- # self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, legacy=False)
293
- # # Set pad_token_id if it's not already set
294
- # if self._model.generation_config.pad_token_id is None or self._model.generation_config.pad_token_id < 0:
295
- # if self._tokenizer.eos_token_id is not None:
296
- # self._model.generation_config.pad_token_id = self._tokenizer.eos_token_id
297
- # logging.info(f"Set pad_token_id to {self._tokenizer.eos_token_id}")
298
- # else:
299
- # logging.warning("No valid eos_token_id found. Setting pad_token_id to 0 as a fallback.")
300
- # self._model.generation_config.pad_token_id = 0
301
- # else:
302
- # self._model = pipeline(self.task, model=self.model_name, tokenizer=self.tokenizer)
303
- # return self._model
304
-
305
- # # Text extraction agents
306
- # class TextExtractorAgent:
307
- # @staticmethod
308
- # def extract_text(filepath, ext):
309
- # """Extract text based on file type."""
310
- # try:
311
- # if ext == "pdf":
312
- # return TextExtractorAgent.extract_text_from_pdf(filepath)
313
- # elif ext in {"jpg", "jpeg", "png"}:
314
- # return TextExtractorAgent.extract_text_from_image(filepath)
315
- # elif ext == "docx":
316
- # return TextExtractorAgent.extract_text_from_docx(filepath)
317
- # elif ext in {"xlsx", "xls"}:
318
- # return TextExtractorAgent.extract_text_from_excel(filepath)
319
- # return None
320
- # except Exception as e:
321
- # logging.error(f"Text extraction failed: {e}")
322
- # return None
323
-
324
- # @staticmethod
325
- # def extract_text_from_pdf(filepath):
326
- # """Extract text from a PDF file."""
327
- # text = ""
328
- # with pdfplumber.open(filepath) as pdf:
329
- # for page in pdf.pages:
330
- # page_text = page.extract_text()
331
- # if page_text:
332
- # text += page_text + "\n"
333
- # return text.strip() or None
334
-
335
- # @staticmethod
336
- # def extract_text_from_image(filepath):
337
- # """Extract text from an image using OCR."""
338
- # image = cv2.imread(filepath)
339
- # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
340
- # _, processed = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
341
- # with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
342
- # processed_path = temp_file.name
343
- # cv2.imwrite(processed_path, processed)
344
- # text = pytesseract.image_to_string(Image.open(processed_path), lang='eng')
345
- # os.remove(processed_path)
346
- # return text.strip() or None
347
-
348
- # @staticmethod
349
- # def extract_text_from_docx(filepath):
350
- # """Extract text from a DOCX file."""
351
- # doc = Document(filepath)
352
- # text = "\n".join([para.text for para in doc.paragraphs])
353
- # return text.strip() or None
354
-
355
- # @staticmethod
356
- # def extract_text_from_excel(filepath):
357
- # """Extract text from an Excel file."""
358
- # dfs = pd.read_excel(filepath, sheet_name=None)
359
- # text = "\n".join([
360
- # "\n".join([
361
- # " ".join(map(str, df[col].dropna()))
362
- # for col in df.columns
363
- # ])
364
- # for df in dfs.values()
365
- # ])
366
- # return text.strip() or None
367
-
368
- # # PHI scrubbing agent
369
- # class PHIScrubberAgent:
370
- # @staticmethod
371
- # def scrub_phi(text):
372
- # """Remove sensitive personal health information (PHI)."""
373
- # try:
374
- # text = re.sub(r'\b(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', '[PHONE]', text)
375
- # text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', text)
376
- # text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]', text)
377
- # text = re.sub(r'\b\d{1,5}\s+\w+\s+(Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
378
- # text = re.sub(r'\bDr\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b', 'Dr. [NAME]', text)
379
- # text = re.sub(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]', text)
380
- # except Exception as e:
381
- # logging.error(f"PHI scrubbing failed: {e}")
382
- # return text
383
-
384
- # # Summarization agent
385
- # class SummarizerAgent:
386
- # def __init__(self, summarization_model_loader):
387
- # self.summarization_model_loader = summarization_model_loader
388
-
389
- # def generate_summary(self, text):
390
- # """Generate a summary of the provided text."""
391
- # model = self.summarization_model_loader.load()
392
- # try:
393
- # summary_result = model(text, do_sample=False)
394
- # return summary_result[0]['summary_text'].strip()
395
- # except Exception as e:
396
- # logging.error(f"Summary generation failed: {e}")
397
- # return "Summary generation failed."
398
-
399
- # def allowed_file(filename, allowed_extensions):
400
- # """Check if the file extension is allowed."""
401
- # return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions
402
- # # Knowledge Base
403
- # class KnowledgeBase:
404
- # def __init__(self, documents):
405
- # self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
406
- # self.documents = documents
407
- # self.embeddings = self.embedding_model.encode(documents)
408
- # self.dimension = self.embedding_model.get_sentence_embedding_dimension()
409
- # self.index = faiss.IndexFlatL2(self.dimension)
410
- # self.index.add(self.embeddings)
411
-
412
- # def retrieve_relevant_info(self, query, top_k=3):
413
- # """Retrieve relevant medical information from the knowledge base."""
414
- # query_embedding = self.embedding_model.encode([query])
415
- # distances, indices = self.index.search(query_embedding, top_k)
416
- # relevant_texts = [self.documents[i] for i in indices[0]]
417
- # return relevant_texts
418
- # # Medical data extraction agent
419
- # class MedicalDataExtractorAgent:
420
- # def __init__(self, model_loader, knowledge_base):
421
- # self.model_loader = model_loader
422
- # self.knowledge_base = knowledge_base
423
-
424
- # def retrieve_relevant_info(self, query, top_k=3):
425
- # """Retrieve relevant medical information from the knowledge base."""
426
- # query_embedding = self.knowledge_base.embedding_model.encode([query])
427
- # distances, indices = self.knowledge_base.index.search(query_embedding, top_k)
428
- # relevant_texts = [self.knowledge_base.documents[i] for i in indices[0]]
429
- # return relevant_texts
430
-
431
- # def extract_medical_data(self, text):
432
- # """Extract structured medical data from text using Agentic RAG."""
433
- # try:
434
- # # Define the default JSON schema
435
- # default_schema = {
436
- # "patient_name": "[NAME]",
437
- # "age": None,
438
- # "gender": None,
439
- # "diagnosis": [],
440
- # "symptoms": [],
441
- # "medications": [],
442
- # "allergies": [],
443
- # "vitals": {
444
- # "blood_pressure": None,
445
- # "heart_rate": None,
446
- # "temperature": None
447
- # },
448
- # "notes": ""
449
- # }
450
- # # Construct the prompt with the input text
451
- # prompt = f"""
452
- # ### Instruction:
453
- # Extract structured medical data from the following text as a JSON whose parameters are enclosed in "" and without any \.
454
- # The JSON should include patientname, age, gender, medications, allergies, diagnosis, symptoms, vitals, and notes.
455
- # ### Text:
456
- # {text}
457
- # ### Response:
458
- # """
459
- # # Tokenize and generate the response
460
- # model = self.model_loader.load()
461
- # tokenizer = self.model_loader._tokenizer
462
- # inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
463
- # outputs = model.generate(
464
- # inputs.input_ids,
465
- # num_return_sequences=1,
466
- # temperature=0.7,
467
- # top_p=0.9,
468
- # do_sample=True
469
- # )
470
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
471
- # logging.info(f"Model response: {response}")
472
- # # Parse and normalize the JSON output
473
- # json_start = response.find("{")
474
- # json_end = response.rfind("}") + 1
475
- # if json_start == -1 or json_end == -1:
476
- # raise ValueError("No JSON found in the model response.")
477
- # # Extract the JSON substring
478
- # structured_data = json.loads(response[json_start:json_end])
479
- # # Normalize the JSON output
480
- # normalized_data = self.normalize_json_output(structured_data, default_schema)
481
- # # Ensure blood pressure is a string
482
- # if normalized_data["vitals"]["blood_pressure"] and isinstance(normalized_data["vitals"]["blood_pressure"], str):
483
- # normalized_data["vitals"]["blood_pressure"] = normalized_data["vitals"]["blood_pressure"].strip('"')
484
- # return json.dumps(normalized_data)
485
- # except json.JSONDecodeError as e:
486
- # logging.error(f"JSON parsing failed: {e}")
487
- # return json.dumps({"error": f"Failed to parse JSON: {str(e)}"})
488
- # except Exception as e:
489
- # logging.error(f"Error extracting medical data: {e}")
490
- # return json.dumps({"error": f"Failed to extract medical data: {str(e)}"})
491
-
492
- # @staticmethod
493
- # def normalize_json_output(model_output, default_schema):
494
- # """
495
- # Normalize the model's JSON output to match the default schema.
496
- # """
497
- # try:
498
- # normalized_output = default_schema.copy()
499
- # for key in normalized_output:
500
- # if key in model_output:
501
- # normalized_output[key] = model_output[key]
502
- # return normalized_output
503
- # except Exception as e:
504
- # logging.error(f"Failed to normalize JSON: {e}")
505
- # return default_schema # Return the default schema in case of errors
506
-
507
- # # Initialize lazy loaders
508
- # medalpaca_model_loader = LazyModelLoader("lmsys/vicuna-7b-v1.5", "text-generation")
509
- # summarization_model_loader = LazyModelLoader("google-t5/t5-small", "summarization")
510
- # whisper_model = whisper.load_model("tiny")
511
-
512
- # # Initialize knowledge base
513
- # medical_documents = [
514
- # "Hypertension is a chronic condition characterized by elevated blood pressure.",
515
- # "Diabetes is a metabolic disorder that affects blood sugar levels.",
516
- # "Common symptoms of chest pain include pressure, tightness, or discomfort in the chest."
517
- # ]
518
- # knowledge_base = KnowledgeBase(medical_documents)
519
-
520
- # # Initialize agents
521
- # text_extractor_agent = TextExtractorAgent()
522
- # phi_scrubber_agent = PHIScrubberAgent()
523
- # medical_data_extractor_agent = MedicalDataExtractorAgent(medalpaca_model_loader, knowledge_base)
524
- # summarizer_agent = SummarizerAgent(summarization_model_loader)
525
-
526
- # # API Endpoints
527
- # @app.route('/api/extract_medical_data', methods=['POST'])
528
- # def extract_medical_data():
529
- # """Extract structured medical data from raw text."""
530
- # try:
531
- # data = request.json
532
- # if "text" not in data or not data["text"].strip():
533
- # return jsonify({"error": "No valid text provided"}), 400
534
- # raw_text = data["text"]
535
- # clean_text = phi_scrubber_agent.scrub_phi(raw_text)
536
- # structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
537
- # return jsonify(json.loads(structured_data)), 200
538
- # except Exception as e:
539
- # logging.error(f"Failed to extract medical data: {e}")
540
- # return jsonify({"error": f"Extraction Error: {str(e)}"}), 500
541
-
542
- # @app.route('/api/transcribe', methods=['POST'])
543
- # def transcribe_audio():
544
- # """Transcribe audio files into text."""
545
- # if 'audio' not in request.files:
546
- # abort(400, description="No audio file provided")
547
- # audio_file = request.files['audio']
548
- # if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
549
- # abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
550
- # filename = secure_filename(audio_file.filename)
551
- # audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
552
- # audio_file.save(audio_path)
553
- # try:
554
- # result = whisper_model.transcribe(audio_path)
555
- # transcribed_text = result["text"]
556
- # os.remove(audio_path)
557
- # return jsonify({"transcribed_text": transcribed_text}), 200
558
- # except Exception as e:
559
- # logging.error(f"Transcription failed: {str(e)}")
560
- # return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
561
-
562
- # @app.route('/api/generate_summary', methods=['POST'])
563
- # def generate_summary():
564
- # """Generate a summary from the provided text."""
565
- # data = request.json
566
- # if "text" not in data or not data["text"].strip():
567
- # return jsonify({"error": "No valid text provided"}), 400
568
- # context = data["text"]
569
- # clean_text = phi_scrubber_agent.scrub_phi(context)
570
- # summary = summarizer_agent.generate_summary(clean_text)
571
- # return jsonify({"summary": summary}), 200
572
-
573
- # @app.route('/api/extract_medical_data_from_audio', methods=['POST'])
574
- # def extract_medical_data_from_audio():
575
- # """Extract medical data from transcribed audio."""
576
- # if 'audio' not in request.files:
577
- # abort(400, description="No audio file provided")
578
- # audio_file = request.files['audio']
579
- # if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
580
- # abort(400, description="Invalid file type. Only mp3, wav, and flac files are allowed.")
581
- # filename = secure_filename(audio_file.filename)
582
- # audio_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
583
- # audio_file.save(audio_path)
584
- # try:
585
- # result = whisper_model.transcribe(audio_path)
586
- # transcribed_text = result["text"]
587
- # clean_text = phi_scrubber_agent.scrub_phi(transcribed_text)
588
- # summary = summarizer_agent.generate_summary(transcribed_text)
589
- # structured_data = medical_data_extractor_agent.extract_medical_data(transcribed_text)
590
- # response = {
591
- # "transcribed_text": transcribed_text,
592
- # "summary": summary,
593
- # "medical_chart": json.loads(structured_data)
594
- # }
595
- # os.remove(audio_path)
596
- # return jsonify(response), 200
597
- # except Exception as e:
598
- # logging.error(f"Processing failed: {str(e)}")
599
- # return jsonify({"error": f"Processing failed: {str(e)}"}), 500
600
-
601
- # @app.route('/upload_document', methods=['POST'])
602
- # def upload_document():
603
- # """Upload and extract text from documents."""
604
- # if 'file' not in request.files:
605
- # return jsonify({"error": "No file uploaded"}), 400
606
- # file = request.files['file']
607
- # if file.filename == '':
608
- # return jsonify({"error": "No file selected"}), 400
609
- # if file and allowed_file(file.filename, ALLOWED_DOCUMENT_EXTENSIONS):
610
- # filename = secure_filename(file.filename)
611
- # filepath = os.path.join(UPLOAD_DIR, filename)
612
- # file.save(filepath)
613
- # ext = filename.rsplit('.', 1)[1].lower()
614
- # extracted_text = text_extractor_agent.extract_text(filepath, ext)
615
- # if not extracted_text:
616
- # return jsonify({"error": "No text found in file."}), 400
617
- # response_data = {
618
- # "file": filename,
619
- # "extracted_text": extracted_text[:500],
620
- # "message": "Click to extract medical terms"
621
- # }
622
- # os.remove(filepath)
623
- # return jsonify(response_data), 200
624
- # return jsonify({"error": "Invalid file type"}), 400
625
-
626
- # @app.route('/extract_medical_data_from_document', methods=['POST'])
627
- # def extract_medical_data_from_document():
628
- # """Extract medical data from document text."""
629
- # data = request.json
630
- # if "text" not in data or not data["text"].strip():
631
- # return jsonify({"error": "No valid text provided"}), 400
632
- # context = data["text"]
633
- # clean_text = phi_scrubber_agent.scrub_phi(context)
634
- # structured_data = medical_data_extractor_agent.extract_medical_data(clean_text)
635
- # return jsonify(json.loads(structured_data)), 200
636
-
637
- # if __name__ == '__main__':
638
- # app.run(host='0.0.0.0', port=5000, debug=True)