philipk22 committed
Commit 2eeebbc · 1 Parent(s): eee67e7

Initial add from the remote

.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .cache/
IND-312.pdf ADDED
Binary file (423 kB).
 
README.md CHANGED
@@ -1,12 +1,6 @@
- ---
- title: Hf Indassistant
- emoji: 🦀
- colorFrom: gray
- colorTo: yellow
+ title: IND Assistant Application
+ emoji: 📚
+ colorFrom: blue
+ colorTo: green
  sdk: streamlit
- sdk_version: 1.42.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ app_port: 8860
ind_checklist_stlit.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import json
+ from typing import List
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from operator import itemgetter
+ import nest_asyncio
+ from langchain.schema import Document
+
+ # Apply nest_asyncio for async operations
+ nest_asyncio.apply()
+
+ # Set environment variables for API keys
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") # OpenAI API Key
+ os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv("LLAMA_CLOUD_API_KEY") # Llama Cloud API Key
+
+ # File paths
+ PDF_FILE = "IND-312.pdf"
+ PREPROCESSED_FILE = "preprocessed_docs.json"
+
+ # Load and parse PDF (only for preprocessing)
+ def load_pdf(pdf_path: str) -> List[Document]:
+     """Loads a PDF, processes it with LlamaParse, and splits it into LangChain documents."""
+     from llama_parse import LlamaParse # Import only if needed
+
+     file_size = os.path.getsize(pdf_path) / (1024 * 1024) # Size in MB
+     workers = 2 if file_size > 2 else 1 # Use 2 workers for PDFs >2MB
+
+     parser = LlamaParse(
+         api_key=os.environ["LLAMA_CLOUD_API_KEY"],
+         result_type="markdown",
+         num_workers=workers,
+         verbose=True
+     )
+
+     # Parse PDF to documents
+     llama_documents = parser.load_data(pdf_path)
+
+     # Convert to LangChain documents
+     documents = [
+         Document(
+             page_content=doc.text,
+             metadata={"source": pdf_path, "page": doc.metadata.get("page_number", 0)}
+         ) for doc in llama_documents
+     ]
+
+     # Split documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50,
+         length_function=len,
+     )
+
+     return text_splitter.split_documents(documents)
+
+ # Preprocess the PDF and save to JSON (Only if it doesn't exist)
+ def preprocess_pdf(pdf_path: str, output_path: str = PREPROCESSED_FILE):
+     """Preprocess PDF only if the output file does not exist."""
+     if os.path.exists(output_path):
+         print(f"Preprocessed data already exists at {output_path}. Skipping PDF processing.")
+         return # Skip processing if file already exists
+
+     print("Processing PDF for the first time...")
+
+     documents = load_pdf(pdf_path) # Load and process the PDF
+
+     # Convert documents to JSON format
+     json_data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents]
+
+     # Save to file
+     with open(output_path, "w", encoding="utf-8") as f:
+         json.dump(json_data, f, indent=4)
+
+     print(f"Preprocessed PDF saved to {output_path}")
+
+ # Load preprocessed data instead of parsing PDF
+ def load_preprocessed_data(json_path: str) -> List[Document]:
+     """Load preprocessed data from JSON."""
+     if not os.path.exists(json_path):
+         raise FileNotFoundError(f"Preprocessed file {json_path} not found. Run preprocessing first.")
+
+     with open(json_path, "r", encoding="utf-8") as f:
+         json_data = json.load(f)
+
+     return [Document(page_content=d["content"], metadata=d["metadata"]) for d in json_data]
+
+ # Initialize vector store from preprocessed data
+ def init_vector_store(documents: List[Document]):
+     """Initialize a vector store using HuggingFace embeddings and Qdrant."""
+     if not documents or not all(doc.page_content.strip() for doc in documents):
+         raise ValueError("No valid documents found for vector storage")
+
+     # Initialize embedding model
+     embedding_model = HuggingFaceBgeEmbeddings(
+         model_name="BAAI/bge-base-en-v1.5",
+         encode_kwargs={'normalize_embeddings': True}
+     )
+
+     return Qdrant.from_documents(
+         documents=documents,
+         embedding=embedding_model,
+         location=":memory:",
+         collection_name="ind312_docs",
+         force_recreate=False
+     )
+
+ # Create RAG chain for retrieval-based Q&A
+ def create_rag_chain(retriever):
+     """Create a retrieval-augmented generation (RAG) chain for answering questions."""
+     # Load prompt template
+     with open("template.md") as f:
+         template_content = f.read()
+
+     prompt = ChatPromptTemplate.from_template("""
+     You are an FDA regulatory expert. Use this structure for checklists:
+     {template}
+
+     Context from IND-312:
+     {context}
+
+     Question: {question}
+
+     Answer in Markdown with checkboxes (- [ ]). If unsure, say "I can only answer IND related questions.".
+     """)
+
+     return (
+         {
+             "context": itemgetter("question") | retriever,
+             "question": itemgetter("question"),
+             "template": lambda _: template_content # Inject template content
+         }
+         | RunnablePassthrough.assign(context=itemgetter("context"))
+         | {"response": prompt | ChatOpenAI(model="gpt-4") | StrOutputParser()}
+     )
+
+ # Run preprocessing only if executed directly (NOT when imported)
+ if __name__ == "__main__":
+     preprocess_pdf(PDF_FILE)
+
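The functions above are only exercised through `streamlit_app.py`, but the chain can also be driven directly. A minimal sketch, assuming `preprocessed_docs.json` and `template.md` exist in the working directory and `OPENAI_API_KEY` is set (the question string is just an example):

```python
# Minimal sketch: build the RAG chain from the preprocessed JSON and ask one question.
from ind_checklist_stlit import load_preprocessed_data, init_vector_store, create_rag_chain

documents = load_preprocessed_data("preprocessed_docs.json")  # raises if preprocessing was skipped
retriever = init_vector_store(documents).as_retriever()
rag_chain = create_rag_chain(retriever)

result = rag_chain.invoke({"question": "What must an initial IND submission include?"})
print(result["response"])  # Markdown checklist produced by the chain
```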
preprocessed_docs.json ADDED
The diff for this file is too large to render.
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ openai>=1.0.0
+ langchain>=0.0.148
+ langchain-openai>=0.0.1
+ langchain-community>=0.1.0
+ streamlit>=1.32.0
+ qdrant-client>=0.3.0
+ llama-parse>=0.0.1
+ nest-asyncio>=1.5.6
+ torch>=2.0.0
+ sentence-transformers>=2.2.2
+ langgraph>=0.1.0
streamlit_app.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import json
+ import streamlit as st
+ from ind_checklist_stlit import load_preprocessed_data, init_vector_store, create_rag_chain
+
+ # Prevent Streamlit from auto-reloading on file changes
+ os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
+
+ # Define the preprocessed file path
+ PREPROCESSED_FILE = "preprocessed_docs.json"
+
+ # Caching function to prevent redundant RAG processing
+ @st.cache_data
+ def cached_response(question: str):
+     """Retrieve cached response if available, otherwise compute response."""
+     return st.session_state.rag_chain.invoke({"question": question})["response"]
+
+ def main():
+     st.title("Appian IND Application Assistant")
+     st.markdown("Chat about Investigational New Drug Applications")
+
+     # Button to clear chat history
+     if st.button("Clear Chat History"):
+         st.session_state.messages = []
+         st.rerun()
+
+     # Initialize session state
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     # Load preprocessed data and initialize the RAG chain
+     if "rag_chain" not in st.session_state:
+         if not os.path.exists(PREPROCESSED_FILE):
+             st.error(f"❌ Preprocessed file '{PREPROCESSED_FILE}' not found. Please run preprocessing first.")
+             return # Stop execution if preprocessed data is missing
+
+         with st.spinner("🔄 Initializing knowledge base..."):
+             documents = load_preprocessed_data(PREPROCESSED_FILE)
+             vectorstore = init_vector_store(documents)
+             st.session_state.rag_chain = create_rag_chain(vectorstore.as_retriever())
+
+     # Display chat history
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # Chat input and response handling
+     if prompt := st.chat_input("Ask about IND requirements"):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         # Display user message
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         # Generate response (cached if already asked before)
+         with st.chat_message("assistant"):
+             response = cached_response(prompt)
+             st.markdown(response)
+
+         # Store bot response in chat history
+         st.session_state.messages.append({"role": "assistant", "content": response})
+
+ if __name__ == "__main__":
+     main()
+
submission_assessment.py ADDED
@@ -0,0 +1,346 @@
+ """
+ Submission Assessment Module
+
+ This module implements a LangGraph agentic pipeline to perform
+ cross-reference of an uploaded submission package (ZIP file) against a predefined
+ IND checklist. It supports processing of both PDF (using LlamaParse in the
+ pre-agent phase) and text files.
+
+ A Streamlit interface is provided to allow users to upload a ZIP file and view the assessment report.
+ """
+
+ import os
+ import io
+ import tempfile
+ from zipfile import ZipFile
+ import streamlit as st
+ from llama_parse import LlamaParse
+
+ import pickle
+ import hashlib
+
+
+ # Access API key from environment variable
+ LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
+
+ # Check if the API key is available
+ if not LLAMA_CLOUD_API_KEY:
+     st.error("LLAMA_CLOUD_API_KEY not found in environment variables. Please set it in your Hugging Face Space secrets.")
+     st.stop()
+
+ # Sample Checklist Configuration (this should be adjusted to your actual IND requirements)
+ IND_CHECKLIST = {
+     "Investigator Brochure": {
+         "file_patterns": ["brochure", "ib"],
+         "required_keywords": ["pharmacology", "toxicology", "clinical data"]
+     },
+     "Clinical Protocol": {
+         "file_patterns": ["clinical", "protocol"],
+         "required_keywords": ["study design", "objectives", "patient population", "dosing regimen", "endpoints"]
+     },
+     "Form FDA-1571": {
+         "file_patterns": ["1571", "fda-1571"],
+         "required_keywords": [
+             # Sponsor Information
+             "Name of Sponsor",
+             "Date of Submission",
+             "Address 1",
+             "Sponsor Telephone Number",
+             # Drug Information
+             "Name of Drug",
+             "IND Type",
+             "Proposed Indication for Use",
+             # Regulatory Information
+             "Phase of Clinical Investigation",
+             "Serial Number",
+             # Application Contents
+             "Table of Contents",
+             "Investigator's Brochure",
+             "Study protocol",
+             "Investigator data",
+             "Facilities data",
+             "Institutional Review Board data",
+             "Environmental assessment",
+             "Pharmacology and Toxicology",
+             # Signatures and Certifications
+             #"Person Responsible for Clinical Investigation Monitoring",
+             #"Person Responsible for Reviewing Safety Information",
+             "Sponsor or Sponsor's Authorized Representative First Name",
+             "Sponsor or Sponsor's Authorized Representative Last Name",
+             "Sponsor or Sponsor's Authorized Representative Title",
+             "Sponsor or Sponsor's Authorized Representative Telephone Number",
+             "Date of Sponsor's Signature"
+         ]
+     }
+ }
+
+
+ class ChecklistCrossReferenceAgent:
+     """
+     Agent that cross-references the pre-parsed submission package data
+     against a predefined IND checklist.
+
+     Input:
+         submission_data: list of dicts representing each file with keys:
+             - "filename": Filename of the document.
+             - "file_type": e.g., "pdf" or "txt"
+             - "content": Extracted text from the document.
+             - "metadata": (Optional) Additional metadata.
+         checklist: dict representing the IND checklist.
+     Output:
+         A mapping of checklist items to their verification status.
+     """
+     def __init__(self, checklist):
+         self.checklist = checklist
+
+     def run(self, submission_data):
+         cross_reference_result = {}
+         for document_name, config in self.checklist.items():
+             file_patterns = config.get("file_patterns", [])
+             required_keywords = config.get("required_keywords", [])
+             matched_file = None
+
+             # Attempt to find a matching file based on filename patterns.
+             for file_info in submission_data:
+                 filename = file_info.get("filename", "").lower()
+                 if any(pattern.lower() in filename for pattern in file_patterns):
+                     matched_file = file_info
+                     break
+
+             # Build the result per checklist item.
+             if not matched_file:
+                 # File is completely missing.
+                 cross_reference_result[document_name] = {
+                     "status": "missing",
+                     "missing_fields": required_keywords
+                 }
+             else:
+                 # File found, check if its content includes the required keywords.
+                 content = matched_file.get("content", "").lower()
+                 missing_fields = []
+                 for keyword in required_keywords:
+                     if keyword.lower() not in content:
+                         missing_fields.append(keyword)
+                 if missing_fields:
+                     cross_reference_result[document_name] = {
+                         "status": "incomplete",
+                         "missing_fields": missing_fields
+                     }
+                 else:
+                     cross_reference_result[document_name] = {
+                         "status": "present",
+                         "missing_fields": []
+                     }
+         return cross_reference_result
+
+
+ class AssessmentRecommendationAgent:
+     """
+     Agent that analyzes the cross-reference data and produces an
+     assessment report with recommendations.
+
+     Input:
+         cross_reference_result: dict mapping checklist items to their status.
+     Output:
+         A dict containing an overall compliance flag and detailed recommendations.
+     """
+     def run(self, cross_reference_result):
+         recommendations = {}
+         overall_compliant = True
+
+         for doc, result in cross_reference_result.items():
+             status = result.get("status")
+             if status == "missing":
+                 recommendations[doc] = f"{doc} is missing. Please include the document."
+                 overall_compliant = False
+             elif status == "incomplete":
+                 missing = ", ".join(result.get("missing_fields", []))
+                 recommendations[doc] = (f"{doc} is incomplete. Missing required fields: {missing}. "
+                                         "Please update accordingly.")
+                 overall_compliant = False
+             else:
+                 recommendations[doc] = f"{doc} is complete."
+         assessment = {
+             "overall_compliant": overall_compliant,
+             "recommendations": recommendations
+         }
+         return assessment
+
+
+ class OutputFormatterAgent:
+     """
+     Agent that formats the assessment report into a user-friendly format.
+     This example formats the output as Markdown.
+
+     Input:
+         assessment: dict output from AssessmentRecommendationAgent.
+     Output:
+         A formatted string report.
+     """
+     def run(self, assessment):
+         overall = "Compliant" if assessment.get("overall_compliant") else "Non-Compliant"
+         lines = []
+         lines.append("# Submission Package Assessment Report")
+         lines.append(f"**Overall Compliance:** {overall}\n")
+         recommendations = assessment.get("recommendations", {})
+         for doc, rec in recommendations.items():
+             lines.append(f"### {doc}")
+             # Format recommendations as bullet points
+             if "incomplete" in rec.lower():
+                 missing_fields = rec.split("Missing required fields: ")[1].split(".")[0].split(", ")
+                 lines.append("- Status: Incomplete")
+                 lines.append(" - Missing Fields:")
+                 for field in missing_fields:
+                     lines.append(f" - {field}")
+             else:
+                 lines.append(f"- Status: {rec}")
+         return "\n".join(lines)
+
+
+ class SupervisorAgent:
+     """
+     Supervisor Agent to orchestrate the agent pipeline in a serial, chained flow:
+
+     1. ChecklistCrossReferenceAgent
+     2. AssessmentRecommendationAgent
+     3. OutputFormatterAgent
+
+     Input:
+         submission_data: Pre-processed submission package data.
+     Output:
+         A final formatted report.
+     """
+     def __init__(self, checklist):
+         self.checklist_agent = ChecklistCrossReferenceAgent(checklist)
+         self.assessment_agent = AssessmentRecommendationAgent()
+         self.formatter_agent = OutputFormatterAgent()
+
+     def run(self, submission_data):
+         # Step 1: Cross-reference the submission data against the checklist.
+         cross_ref_result = self.checklist_agent.run(submission_data)
+         # Step 2: Analyze the cross-reference result to produce assessment and recommendations.
+         assessment_report = self.assessment_agent.run(cross_ref_result)
+         # Step 3: Format the assessment report for display.
+         formatted_report = self.formatter_agent.run(assessment_report)
+         return formatted_report
+
+
+ # --- Helper Functions for ZIP Processing ---
+
+ def process_uploaded_zip(uploaded_zip) -> list:
+     """
+     Processes an uploaded ZIP file, caches embeddings, and returns a list of file dictionaries.
+     """
+     submission_data = []
+
+     with ZipFile(uploaded_zip) as zip_ref:
+         for filename in zip_ref.namelist():
+             file_ext = os.path.splitext(filename)[1].lower()
+             file_bytes = zip_ref.read(filename)
+             content = ""
+
+             # Generate a unique cache key based on the file content
+             file_hash = hashlib.md5(file_bytes).hexdigest()
+             cache_key = f"{filename}_{file_hash}"
+             cache_file = f".cache/{cache_key}.pkl" # Cache file path
+
+             # Create the cache directory if it doesn't exist
+             os.makedirs(".cache", exist_ok=True)
+
+             if os.path.exists(cache_file):
+                 # Load from cache
+                 print(f"Loading {filename} from cache")
+                 try:
+                     with open(cache_file, "rb") as f:
+                         content = pickle.load(f)
+                 except Exception as e:
+                     st.error(f"Error loading {filename} from cache: {str(e)}")
+                     content = "" # Or handle the error as appropriate
+             else:
+                 # Process and cache
+                 print(f"Processing {filename} and caching")
+                 if file_ext == ".pdf":
+                     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                         tmp.write(file_bytes)
+                         tmp.flush()
+                         tmp_path = tmp.name
+                     file_size = os.path.getsize(tmp_path) / (1024 * 1024)
+                     workers = 2 if file_size > 2 else 1
+                     try:
+                         parser = LlamaParse(
+                             api_key=LLAMA_CLOUD_API_KEY,
+                             result_type="markdown",
+                             num_workers=workers,
+                             verbose=True
+                         )
+                         llama_documents = parser.load_data(tmp_path)
+                         content = "\n".join([doc.text for doc in llama_documents])
+                     except Exception as e:
+                         content = f"Error parsing PDF: {str(e)}"
+                         st.error(f"Error parsing PDF {filename}: {str(e)}")
+                     finally:
+                         os.remove(tmp_path)
+                 elif file_ext == ".txt":
+                     try:
+                         content = file_bytes.decode("utf-8")
+                     except UnicodeDecodeError:
+                         content = file_bytes.decode("latin1")
+                     except Exception as e:
+                         content = f"Error decoding text file {filename}: {str(e)}"
+                         st.error(f"Error decoding text file {filename}: {str(e)}")
+                 else:
+                     continue # Skip unsupported file types
+
+                 # Save to cache
+                 try:
+                     with open(cache_file, "wb") as f:
+                         pickle.dump(content, f)
+                 except Exception as e:
+                     st.error(f"Error saving {filename} to cache: {str(e)}")
+
+             submission_data.append({
+                 "filename": filename,
+                 "file_type": file_ext.replace(".", ""),
+                 "content": content,
+                 "metadata": {}
+             })
+     return submission_data
+
+
+ # --- Streamlit Interface ---
+
+ def main():
+     st.title("Submission Package Assessment")
+     st.write(
+         """
+         Upload a ZIP file containing your submission package.
+         The ZIP file can include PDF and text files.
+         """
+     )
+
+     uploaded_file = st.file_uploader("Choose a ZIP file", type=["zip"])
+
+     if uploaded_file is not None:
+         try:
+             # Process the uploaded ZIP file to extract submission data
+             submission_data = process_uploaded_zip(uploaded_file)
+             st.success("File processed successfully!")
+
+             # Display a summary of the extracted files
+             st.subheader("Extracted Files")
+             for file_info in submission_data:
+                 st.write(f"**{file_info['filename']}** - ({file_info['file_type'].upper()})")
+
+             # Instantiate and run the SupervisorAgent
+             supervisor = SupervisorAgent(IND_CHECKLIST)
+             assessment_report = supervisor.run(submission_data)
+
+             st.subheader("Assessment Report")
+             st.markdown(assessment_report)
+         except Exception as e:
+             st.error(f"Error processing file: {str(e)}")
+
+
+ if __name__ == "__main__":
+     # To run with Streamlit, use: streamlit run submission_assessment.py
+     main()
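The agent docstrings above define the `submission_data` contract: a list of dicts with `filename`, `file_type`, `content`, and `metadata` keys. A minimal sketch of driving the pipeline outside Streamlit with hand-built entries instead of a ZIP upload; it assumes `LLAMA_CLOUD_API_KEY` is set (the module checks it at import time), and the filename and content strings below are placeholders:

```python
# Minimal sketch: feed hand-built submission_data through the agent pipeline.
from submission_assessment import SupervisorAgent, IND_CHECKLIST

submission_data = [
    {
        "filename": "clinical_protocol.txt",  # matches the "protocol" file pattern
        "file_type": "txt",
        "content": "Study design, objectives, patient population, dosing regimen, endpoints ...",
        "metadata": {},
    },
    # No Investigator Brochure or Form FDA-1571 entries, so the report
    # should flag both as missing.
]

report = SupervisorAgent(IND_CHECKLIST).run(submission_data)
print(report)  # Markdown assessment produced by OutputFormatterAgent
```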
submission_assessment0.py ADDED
@@ -0,0 +1,324 @@
+ """
+ Submission Assessment Module
+
+ This module implements a LangGraph agentic pipeline to perform
+ cross-reference of an uploaded submission package (ZIP file) against a predefined
+ IND checklist. It supports processing of both PDF (using LlamaParse in the
+ pre-agent phase) and text files.
+
+ A Streamlit interface is provided to allow users to upload a ZIP file and view the assessment report.
+ """
+
+ import os
+ import io
+ import tempfile
+ from zipfile import ZipFile
+
+ import streamlit as st
+
+ # Import LlamaParse for PDF processing (assumes it's installed and configured)
+ from llama_parse import LlamaParse
+
+ # Note: These agent classes are implemented for demonstration.
+ # In a real-world scenario, you might integrate the official LangGraph agent APIs.
+
+ # Sample Checklist Configuration (this should be adjusted to your actual IND requirements)
+ IND_CHECKLIST = {
+     "Investigator Brochure": {
+         "file_patterns": ["brochure", "ib"],
+         "required_keywords": ["pharmacology", "toxicology", "clinical data"]
+     },
+     "Clinical Protocol": {
+         "file_patterns": ["clinical", "protocol"],
+         "required_keywords": ["study design", "objectives", "patient population", "dosing regimen", "endpoints"]
+     },
+     "Form FDA-1571": {
+         "file_patterns": ["1571", "fda-1571"],
+         "required_keywords": [
+             # Sponsor Information
+             "Name of Sponsor",
+             "Date of Submission",
+             "Address 1",
+             "Sponsor Telephone Number",
+             # Drug Information
+             "Name of Drug",
+             "IND Type",
+             "Proposed Indication for Use",
+             # Regulatory Information
+             "Phase of Clinical Investigation",
+             "Serial Number",
+             # Application Contents
+             "Table of Contents",
+             "Investigator's Brochure",
+             "Study protocol",
+             "Investigator data",
+             "Facilities data",
+             "Institutional Review Board data",
+             "Environmental assessment",
+             "Pharmacology and Toxicology",
+             # Signatures and Certifications
+             #"Person Responsible for Clinical Investigation Monitoring",
+             #"Person Responsible for Reviewing Safety Information",
+             "Sponsor or Sponsor's Authorized Representative First Name",
+             "Sponsor or Sponsor's Authorized Representative Last Name",
+             "Sponsor or Sponsor's Authorized Representative Title",
+             "Sponsor or Sponsor's Authorized Representative Telephone Number",
+             "Date of Sponsor's Signature"
+         ]
+     }
+ }
+
+
+ class ChecklistCrossReferenceAgent:
+     """
+     Agent that cross-references the pre-parsed submission package data
+     against a predefined IND checklist.
+
+     Input:
+         submission_data: list of dicts representing each file with keys:
+             - "filename": Filename of the document.
+             - "file_type": e.g., "pdf" or "txt"
+             - "content": Extracted text from the document.
+             - "metadata": (Optional) Additional metadata.
+         checklist: dict representing the IND checklist.
+     Output:
+         A mapping of checklist items to their verification status.
+     """
+     def __init__(self, checklist):
+         self.checklist = checklist
+
+     def run(self, submission_data):
+         cross_reference_result = {}
+         for document_name, config in self.checklist.items():
+             file_patterns = config.get("file_patterns", [])
+             required_keywords = config.get("required_keywords", [])
+             matched_file = None
+
+             # Attempt to find a matching file based on filename patterns.
+             for file_info in submission_data:
+                 filename = file_info.get("filename", "").lower()
+                 if any(pattern.lower() in filename for pattern in file_patterns):
+                     matched_file = file_info
+                     break
+
+             # Build the result per checklist item.
+             if not matched_file:
+                 # File is completely missing.
+                 cross_reference_result[document_name] = {
+                     "status": "missing",
+                     "missing_fields": required_keywords
+                 }
+             else:
+                 # File found, check if its content includes the required keywords.
+                 content = matched_file.get("content", "").lower()
+                 missing_fields = []
+                 for keyword in required_keywords:
+                     if keyword.lower() not in content:
+                         missing_fields.append(keyword)
+                 if missing_fields:
+                     cross_reference_result[document_name] = {
+                         "status": "incomplete",
+                         "missing_fields": missing_fields
+                     }
+                 else:
+                     cross_reference_result[document_name] = {
+                         "status": "present",
+                         "missing_fields": []
+                     }
+         return cross_reference_result
+
+
+ class AssessmentRecommendationAgent:
+     """
+     Agent that analyzes the cross-reference data and produces an
+     assessment report with recommendations.
+
+     Input:
+         cross_reference_result: dict mapping checklist items to their status.
+     Output:
+         A dict containing an overall compliance flag and detailed recommendations.
+     """
+     def run(self, cross_reference_result):
+         recommendations = {}
+         overall_compliant = True
+
+         for doc, result in cross_reference_result.items():
+             status = result.get("status")
+             if status == "missing":
+                 recommendations[doc] = f"{doc} is missing. Please include the document."
+                 overall_compliant = False
+             elif status == "incomplete":
+                 missing = ", ".join(result.get("missing_fields", []))
+                 recommendations[doc] = (f"{doc} is incomplete. Missing required fields: {missing}. "
+                                         "Please update accordingly.")
+                 overall_compliant = False
+             else:
+                 recommendations[doc] = f"{doc} is complete."
+         assessment = {
+             "overall_compliant": overall_compliant,
+             "recommendations": recommendations
+         }
+         return assessment
+
+
+ class OutputFormatterAgent:
+     """
+     Agent that formats the assessment report into a user-friendly format.
+     This example formats the output as Markdown.
+
+     Input:
+         assessment: dict output from AssessmentRecommendationAgent.
+     Output:
+         A formatted string report.
+     """
+     def run(self, assessment):
+         overall = "Compliant" if assessment.get("overall_compliant") else "Non-Compliant"
+         lines = []
+         lines.append("# Submission Package Assessment Report")
+         lines.append(f"**Overall Compliance:** {overall}\n")
+         recommendations = assessment.get("recommendations", {})
+         for doc, rec in recommendations.items():
+             lines.append(f"### {doc}")
+             # Format recommendations as bullet points
+             if "incomplete" in rec.lower():
+                 missing_fields = rec.split("Missing required fields: ")[1].split(".")[0].split(", ")
+                 lines.append("- Status: Incomplete")
+                 lines.append(" - Missing Fields:")
+                 for field in missing_fields:
+                     lines.append(f" - {field}")
+             else:
+                 lines.append(f"- Status: {rec}")
+         return "\n".join(lines)
+
+
+ class SupervisorAgent:
+     """
+     Supervisor Agent to orchestrate the agent pipeline in a serial, chained flow:
+
+     1. ChecklistCrossReferenceAgent
+     2. AssessmentRecommendationAgent
+     3. OutputFormatterAgent
+
+     Input:
+         submission_data: Pre-processed submission package data.
+     Output:
+         A final formatted report.
+     """
+     def __init__(self, checklist):
+         self.checklist_agent = ChecklistCrossReferenceAgent(checklist)
+         self.assessment_agent = AssessmentRecommendationAgent()
+         self.formatter_agent = OutputFormatterAgent()
+
+     def run(self, submission_data):
+         # Step 1: Cross-reference the submission data against the checklist.
+         cross_ref_result = self.checklist_agent.run(submission_data)
+         # Step 2: Analyze the cross-reference result to produce assessment and recommendations.
+         assessment_report = self.assessment_agent.run(cross_ref_result)
+         # Step 3: Format the assessment report for display.
+         formatted_report = self.formatter_agent.run(assessment_report)
+         return formatted_report
+
+
+ # --- Helper Functions for ZIP Processing ---
+
+ def process_uploaded_zip(uploaded_zip) -> list:
+     """
+     Processes an uploaded ZIP file (as BytesIO) and returns a list of file dictionaries.
+     Each dictionary contains:
+         - filename: name of the file.
+         - file_type: determined from the extension.
+         - content: extracted text content.
+         - metadata: additional metadata (currently empty).
+     For PDF files, uses LlamaParse for parsing.
+     For TXT files, reads the text directly.
+     """
+     submission_data = []
+
+     # Open the uploaded zip file from the BytesIO buffer.
+     with ZipFile(uploaded_zip) as zip_ref:
+         for filename in zip_ref.namelist():
+             file_ext = os.path.splitext(filename)[1].lower()
+             # Read file bytes
+             file_bytes = zip_ref.read(filename)
+             content = ""
+             if file_ext == ".pdf":
+                 # Create a temporary file for the PDF
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                     tmp.write(file_bytes)
+                     tmp.flush()
+                     tmp_path = tmp.name
+                 # Determine number of workers based on file size (in MB)
+                 file_size = os.path.getsize(tmp_path) / (1024 * 1024)
+                 workers = 2 if file_size > 2 else 1
+                 # Initialize LlamaParse and extract content
+                 parser = LlamaParse(
+                     api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
+                     result_type="markdown",
+                     num_workers=workers,
+                     verbose=True
+                 )
+                 try:
+                     # Load and parse the PDF file
+                     llama_documents = parser.load_data(tmp_path)
+                     # Aggregate text from parsed documents
+                     content = "\n".join([doc.text for doc in llama_documents])
+                 except Exception as e:
+                     content = f"Error parsing PDF: {str(e)}"
+                 finally:
+                     os.remove(tmp_path)
+             elif file_ext == ".txt":
+                 # Decode text content from bytes
+                 try:
+                     content = file_bytes.decode("utf-8")
+                 except UnicodeDecodeError:
+                     content = file_bytes.decode("latin1")
+             else:
+                 # Skip unsupported file types
+                 continue
+
+             submission_data.append({
+                 "filename": filename,
+                 "file_type": file_ext.replace(".", ""),
+                 "content": content,
+                 "metadata": {}
+             })
+     return submission_data
+
+
+ # --- Streamlit Interface ---
+
+ def main():
+     st.title("Submission Package Assessment")
+     st.write(
+         """
+         Upload a ZIP file containing your submission package.
+         The ZIP file can include PDF and text files.
+         """
+     )
+
+     uploaded_file = st.file_uploader("Choose a ZIP file", type=["zip"])
+
+     if uploaded_file is not None:
+         try:
+             # Process the uploaded ZIP file to extract submission data
+             submission_data = process_uploaded_zip(uploaded_file)
+             st.success("File processed successfully!")
+
+             # Display a summary of the extracted files
+             st.subheader("Extracted Files")
+             for file_info in submission_data:
+                 st.write(f"**{file_info['filename']}** - ({file_info['file_type'].upper()})")
+
+             # Instantiate and run the SupervisorAgent
+             supervisor = SupervisorAgent(IND_CHECKLIST)
+             assessment_report = supervisor.run(submission_data)
+
+             st.subheader("Assessment Report")
+             st.markdown(assessment_report)
+         except Exception as e:
+             st.error(f"Error processing file: {str(e)}")
+
+
+ if __name__ == "__main__":
+     # To run with Streamlit, use: streamlit run submission_assessment.py
+     main()
template.md ADDED
@@ -0,0 +1,72 @@
+ 1. Pre-IND Meeting Preparation
+ Request a Pre-IND Meeting: Schedule a meeting with the FDA to discuss your IND submission.
+
+ Prepare Meeting Package: Include proposed clinical trial design, preclinical data, manufacturing information, and any other relevant data.
+
+ Submit Questions: Prepare a list of specific questions for the FDA regarding your IND submission.
+
+ 2. Form FDA 1571
+ Complete Form FDA 1571: Ensure all sections are filled out accurately, including sponsor information, drug information, and clinical trial details.
+
+ Signature: Obtain the required signature from the sponsor or authorized representative.
+
+ 3. Table of Contents
+ Create a Comprehensive Table of Contents: Organize the IND submission with clear sections and page numbers for easy navigation.
+
+ 4. Introductory Statement and General Investigational Plan
+ Introductory Statement: Provide a brief overview of the drug, including its name, structure, and pharmacological class.
+
+ General Investigational Plan: Outline the clinical development plan, including the objectives and duration of the proposed studies.
+
+ 5. Investigator's Brochure
+ Compile the Investigator's Brochure: Include all relevant information about the drug, such as its formulation, pharmacology, toxicology, and clinical data.
+
+ Update as Necessary: Ensure the brochure is up-to-date with the latest data.
+
+ 6. Clinical Protocol
+ Develop Clinical Protocol: Detail the study design, including objectives, patient population, dosing regimen, and endpoints.
+
+ Inclusion/Exclusion Criteria: Clearly define the criteria for patient selection.
+
+ Safety Monitoring: Outline the procedures for monitoring patient safety.
+
+ 7. Chemistry, Manufacturing, and Control (CMC) Information
+ Drug Substance Information: Provide details on the drug substance, including its manufacture, characterization, and controls.
+
+ Drug Product Information: Include information on the drug product, such as formulation, manufacturing process, and specifications.
+
+ Stability Data: Submit stability data to support the proposed shelf life of the drug.
+
+ Labeling: Provide draft labeling for the investigational drug.
+
+ 8. Pharmacology and Toxicology Data
+ Pharmacology Studies: Submit data from in vitro and in vivo studies that demonstrate the drug's pharmacological effects.
+
+ Toxicology Studies: Include data from acute, subacute, and chronic toxicity studies, as well as reproductive and genotoxicity studies.
+
+ Safety Pharmacology: Provide data on the drug's effects on vital organ systems.
+
+ 9. Previous Human Experience
+ Summarize Previous Human Experience: If applicable, include data from previous clinical trials or use in humans.
+
+ Safety and Efficacy Data: Highlight any relevant safety and efficacy findings from prior studies.
+
+ 10. Additional Information
+ Environmental Assessment: Submit an environmental assessment or claim an exclusion if applicable.
+
+ Special Considerations: Include any additional information that may be relevant, such as data from pediatric studies or risk management plans.
+
+ 11. Review and Quality Control
+ Internal Review: Conduct a thorough internal review of the IND submission to ensure accuracy and completeness.
+
+ Quality Control: Verify that all data and documents meet regulatory standards and guidelines.
+
+ 12. Submission to FDA
+ Compile the IND Submission: Assemble all sections into a single, well-organized submission.
+
+ Submit to FDA: Send the IND submission to the appropriate FDA division via the required submission method (e.g., electronic submission).
+
+ Confirmation of Receipt: Obtain confirmation from the FDA that the IND has been received and is under review.
+
+
+