philipk22 committed
Commit 2eeebbc · 1 Parent(s): eee67e7

Initial add from the remote

.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .cache/
IND-312.pdf ADDED
Binary file (423 kB).
 
README.md CHANGED
@@ -1,12 +1,6 @@
- ---
- title: Hf Indassistant
- emoji: 🦀
- colorFrom: gray
- colorTo: yellow
+ title: IND Assistant Application
+ emoji: 📚
+ colorFrom: blue
+ colorTo: green
  sdk: streamlit
- sdk_version: 1.42.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ app_port: 8860
ind_checklist_stlit.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import json
+ from typing import List
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+ from langchain_openai.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+ from langchain.schema.runnable import RunnablePassthrough
+ from langchain_core.output_parsers import StrOutputParser
+ from operator import itemgetter
+ import nest_asyncio
+ from langchain.schema import Document
+
+ # Apply nest_asyncio for async operations
+ nest_asyncio.apply()
+
+ # Set environment variables for API keys
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") # OpenAI API Key
+ os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv("LLAMA_CLOUD_API_KEY") # Llama Cloud API Key
+
+ # File paths
+ PDF_FILE = "IND-312.pdf"
+ PREPROCESSED_FILE = "preprocessed_docs.json"
+
+ # Load and parse PDF (only for preprocessing)
+ def load_pdf(pdf_path: str) -> List[Document]:
+     """Loads a PDF, processes it with LlamaParse, and splits it into LangChain documents."""
+     from llama_parse import LlamaParse # Import only if needed
+
+     file_size = os.path.getsize(pdf_path) / (1024 * 1024) # Size in MB
+     workers = 2 if file_size > 2 else 1 # Use 2 workers for PDFs >2MB
+
+     parser = LlamaParse(
+         api_key=os.environ["LLAMA_CLOUD_API_KEY"],
+         result_type="markdown",
+         num_workers=workers,
+         verbose=True
+     )
+
+     # Parse PDF to documents
+     llama_documents = parser.load_data(pdf_path)
+
+     # Convert to LangChain documents
+     documents = [
+         Document(
+             page_content=doc.text,
+             metadata={"source": pdf_path, "page": doc.metadata.get("page_number", 0)}
+         ) for doc in llama_documents
+     ]
+
+     # Split documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50,
+         length_function=len,
+     )
+
+     return text_splitter.split_documents(documents)
+
+ # Preprocess the PDF and save to JSON (Only if it doesn't exist)
+ def preprocess_pdf(pdf_path: str, output_path: str = PREPROCESSED_FILE):
+     """Preprocess PDF only if the output file does not exist."""
+     if os.path.exists(output_path):
+         print(f"Preprocessed data already exists at {output_path}. Skipping PDF processing.")
+         return # Skip processing if file already exists
+
+     print("Processing PDF for the first time...")
+
+     documents = load_pdf(pdf_path) # Load and process the PDF
+
+     # Convert documents to JSON format
+     json_data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents]
+
+     # Save to file
+     with open(output_path, "w", encoding="utf-8") as f:
+         json.dump(json_data, f, indent=4)
+
+     print(f"Preprocessed PDF saved to {output_path}")
+
+ # Load preprocessed data instead of parsing PDF
+ def load_preprocessed_data(json_path: str) -> List[Document]:
+     """Load preprocessed data from JSON."""
+     if not os.path.exists(json_path):
+         raise FileNotFoundError(f"Preprocessed file {json_path} not found. Run preprocessing first.")
+
+     with open(json_path, "r", encoding="utf-8") as f:
+         json_data = json.load(f)
+
+     return [Document(page_content=d["content"], metadata=d["metadata"]) for d in json_data]
+
+ # Initialize vector store from preprocessed data
+ def init_vector_store(documents: List[Document]):
+     """Initialize a vector store using HuggingFace embeddings and Qdrant."""
+     if not documents or not all(doc.page_content.strip() for doc in documents):
+         raise ValueError("No valid documents found for vector storage")
+
+     # Initialize embedding model
+     embedding_model = HuggingFaceBgeEmbeddings(
+         model_name="BAAI/bge-base-en-v1.5",
+         encode_kwargs={'normalize_embeddings': True}
+     )
+
+     return Qdrant.from_documents(
+         documents=documents,
+         embedding=embedding_model,
+         location=":memory:",
+         collection_name="ind312_docs",
+         force_recreate=False
+     )
+
+ # Create RAG chain for retrieval-based Q&A
+ def create_rag_chain(retriever):
+     """Create a retrieval-augmented generation (RAG) chain for answering questions."""
+     # Load prompt template
+     with open("template.md") as f:
+         template_content = f.read()
+
+     prompt = ChatPromptTemplate.from_template("""
+     You are an FDA regulatory expert. Use this structure for checklists:
+     {template}
+
+     Context from IND-312:
+     {context}
+
+     Question: {question}
+
+     Answer in Markdown with checkboxes (- [ ]). If unsure, say "I can only answer IND related questions.".
+     """)
+
+     return (
+         {
+             "context": itemgetter("question") | retriever,
+             "question": itemgetter("question"),
+             "template": lambda _: template_content # Inject template content
+         }
+         | RunnablePassthrough.assign(context=itemgetter("context"))
+         | {"response": prompt | ChatOpenAI(model="gpt-4") | StrOutputParser()}
+     )
+
+ # Run preprocessing only if executed directly (NOT when imported)
+ if __name__ == "__main__":
+     preprocess_pdf(PDF_FILE)
+
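The functions above are only exercised through `streamlit_app.py`, but the chain can also be driven directly. A minimal sketch, assuming `preprocessed_docs.json` and `template.md` exist in the working directory and `OPENAI_API_KEY` is set (the question string is just an example):

```python
# Minimal sketch: build the RAG chain from the preprocessed JSON and ask one question.
from ind_checklist_stlit import load_preprocessed_data, init_vector_store, create_rag_chain

documents = load_preprocessed_data("preprocessed_docs.json")  # raises if preprocessing was skipped
retriever = init_vector_store(documents).as_retriever()
rag_chain = create_rag_chain(retriever)

result = rag_chain.invoke({"question": "What must an initial IND submission include?"})
print(result["response"])  # Markdown checklist produced by the chain
```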
preprocessed_docs.json ADDED
The diff for this file is too large to render.
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ openai>=1.0.0
+ langchain>=0.0.148
+ langchain-openai>=0.0.1
+ langchain-community>=0.1.0
+ streamlit>=1.32.0
+ qdrant-client>=0.3.0
+ llama-parse>=0.0.1
+ nest-asyncio>=1.5.6
+ torch>=2.0.0
+ sentence-transformers>=2.2.2
+ langgraph>=0.1.0
streamlit_app.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import json
+ import streamlit as st
+ from ind_checklist_stlit import load_preprocessed_data, init_vector_store, create_rag_chain
+
+ # Prevent Streamlit from auto-reloading on file changes
+ os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
+
+ # Define the preprocessed file path
+ PREPROCESSED_FILE = "preprocessed_docs.json"
+
+ # Caching function to prevent redundant RAG processing
+ @st.cache_data
+ def cached_response(question: str):
+     """Retrieve cached response if available, otherwise compute response."""
+     return st.session_state.rag_chain.invoke({"question": question})["response"]
+
+ def main():
+     st.title("Appian IND Application Assistant")
+     st.markdown("Chat about Investigational New Drug Applications")
+
+     # Button to clear chat history
+     if st.button("Clear Chat History"):
+         st.session_state.messages = []
+         st.rerun()
+
+     # Initialize session state
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     # Load preprocessed data and initialize the RAG chain
+     if "rag_chain" not in st.session_state:
+         if not os.path.exists(PREPROCESSED_FILE):
+             st.error(f"❌ Preprocessed file '{PREPROCESSED_FILE}' not found. Please run preprocessing first.")
+             return # Stop execution if preprocessed data is missing
+
+         with st.spinner("🔄 Initializing knowledge base..."):
+             documents = load_preprocessed_data(PREPROCESSED_FILE)
+             vectorstore = init_vector_store(documents)
+             st.session_state.rag_chain = create_rag_chain(vectorstore.as_retriever())
+
+     # Display chat history
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # Chat input and response handling
+     if prompt := st.chat_input("Ask about IND requirements"):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         # Display user message
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         # Generate response (cached if already asked before)
+         with st.chat_message("assistant"):
+             response = cached_response(prompt)
+             st.markdown(response)
+
+         # Store bot response in chat history
+         st.session_state.messages.append({"role": "assistant", "content": response})
+
+ if __name__ == "__main__":
+     main()
+
submission_assessment.py ADDED
@@ -0,0 +1,346 @@
+ """
+ Submission Assessment Module
+
+ This module implements a LangGraph agentic pipeline to perform
+ cross-reference of an uploaded submission package (ZIP file) against a predefined
+ IND checklist. It supports processing of both PDF (using LlamaParse in the
+ pre-agent phase) and text files.
+
+ A Streamlit interface is provided to allow users to upload a ZIP file and view the assessment report.
+ """
+
+ import os
+ import io
+ import tempfile
+ from zipfile import ZipFile
+ import streamlit as st
+ from llama_parse import LlamaParse
+
+ import pickle
+ import hashlib
+
+
+ # Access API key from environment variable
+ LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
+
+ # Check if the API key is available
+ if not LLAMA_CLOUD_API_KEY:
+     st.error("LLAMA_CLOUD_API_KEY not found in environment variables. Please set it in your Hugging Face Space secrets.")
+     st.stop()
+
+ # Sample Checklist Configuration (this should be adjusted to your actual IND requirements)
+ IND_CHECKLIST = {
+     "Investigator Brochure": {
+         "file_patterns": ["brochure", "ib"],
+         "required_keywords": ["pharmacology", "toxicology", "clinical data"]
+     },
+     "Clinical Protocol": {
+         "file_patterns": ["clinical", "protocol"],
+         "required_keywords": ["study design", "objectives", "patient population", "dosing regimen", "endpoints"]
+     },
+     "Form FDA-1571": {
+         "file_patterns": ["1571", "fda-1571"],
+         "required_keywords": [
+             # Sponsor Information
+             "Name of Sponsor",
+             "Date of Submission",
+             "Address 1",
+             "Sponsor Telephone Number",
+             # Drug Information
+             "Name of Drug",
+             "IND Type",
+             "Proposed Indication for Use",
+             # Regulatory Information
+             "Phase of Clinical Investigation",
+             "Serial Number",
+             # Application Contents
+             "Table of Contents",
+             "Investigator's Brochure",
+             "Study protocol",
+             "Investigator data",
+             "Facilities data",
+             "Institutional Review Board data",
+             "Environmental assessment",
+             "Pharmacology and Toxicology",
+             # Signatures and Certifications
+             #"Person Responsible for Clinical Investigation Monitoring",
+             #"Person Responsible for Reviewing Safety Information",
+             "Sponsor or Sponsor's Authorized Representative First Name",
+             "Sponsor or Sponsor's Authorized Representative Last Name",
+             "Sponsor or Sponsor's Authorized Representative Title",
+             "Sponsor or Sponsor's Authorized Representative Telephone Number",
+             "Date of Sponsor's Signature"
+         ]
+     }
+ }
+
+
+ class ChecklistCrossReferenceAgent:
+     """
+     Agent that cross-references the pre-parsed submission package data
+     against a predefined IND checklist.
+
+     Input:
+         submission_data: list of dicts representing each file with keys:
+             - "filename": Filename of the document.
+             - "file_type": e.g., "pdf" or "txt"
+             - "content": Extracted text from the document.
+             - "metadata": (Optional) Additional metadata.
+         checklist: dict representing the IND checklist.
+     Output:
+         A mapping of checklist items to their verification status.
+     """
+     def __init__(self, checklist):
+         self.checklist = checklist
+
+     def run(self, submission_data):
+         cross_reference_result = {}
+         for document_name, config in self.checklist.items():
+             file_patterns = config.get("file_patterns", [])
+             required_keywords = config.get("required_keywords", [])
+             matched_file = None
+
+             # Attempt to find a matching file based on filename patterns.
+             for file_info in submission_data:
+                 filename = file_info.get("filename", "").lower()
+                 if any(pattern.lower() in filename for pattern in file_patterns):
+                     matched_file = file_info
+                     break
+
+             # Build the result per checklist item.
+             if not matched_file:
+                 # File is completely missing.
+                 cross_reference_result[document_name] = {
+                     "status": "missing",
+                     "missing_fields": required_keywords
+                 }
+             else:
+                 # File found, check if its content includes the required keywords.
+                 content = matched_file.get("content", "").lower()
+                 missing_fields = []
+                 for keyword in required_keywords:
+                     if keyword.lower() not in content:
+                         missing_fields.append(keyword)
+                 if missing_fields:
+                     cross_reference_result[document_name] = {
+                         "status": "incomplete",
+                         "missing_fields": missing_fields
+                     }
+                 else:
+                     cross_reference_result[document_name] = {
+                         "status": "present",
+                         "missing_fields": []
+                     }
+         return cross_reference_result
+
+
+ class AssessmentRecommendationAgent:
+     """
+     Agent that analyzes the cross-reference data and produces an
+     assessment report with recommendations.
+
+     Input:
+         cross_reference_result: dict mapping checklist items to their status.
+     Output:
+         A dict containing an overall compliance flag and detailed recommendations.
+     """
+     def run(self, cross_reference_result):
+         recommendations = {}
+         overall_compliant = True
+
+         for doc, result in cross_reference_result.items():
+             status = result.get("status")
+             if status == "missing":
+                 recommendations[doc] = f"{doc} is missing. Please include the document."
+                 overall_compliant = False
+             elif status == "incomplete":
+                 missing = ", ".join(result.get("missing_fields", []))
+                 recommendations[doc] = (f"{doc} is incomplete. Missing required fields: {missing}. "
+                                         "Please update accordingly.")
+                 overall_compliant = False
+             else:
+                 recommendations[doc] = f"{doc} is complete."
+         assessment = {
+             "overall_compliant": overall_compliant,
+             "recommendations": recommendations
+         }
+         return assessment
+
+
+ class OutputFormatterAgent:
+     """
+     Agent that formats the assessment report into a user-friendly format.
+     This example formats the output as Markdown.
+
+     Input:
+         assessment: dict output from AssessmentRecommendationAgent.
+     Output:
+         A formatted string report.
+     """
+     def run(self, assessment):
+         overall = "Compliant" if assessment.get("overall_compliant") else "Non-Compliant"
+         lines = []
+         lines.append("# Submission Package Assessment Report")
+         lines.append(f"**Overall Compliance:** {overall}\n")
+         recommendations = assessment.get("recommendations", {})
+         for doc, rec in recommendations.items():
+             lines.append(f"### {doc}")
+             # Format recommendations as bullet points
+             if "incomplete" in rec.lower():
+                 missing_fields = rec.split("Missing required fields: ")[1].split(".")[0].split(", ")
+                 lines.append("- Status: Incomplete")
+                 lines.append(" - Missing Fields:")
+                 for field in missing_fields:
+                     lines.append(f" - {field}")
+             else:
+                 lines.append(f"- Status: {rec}")
+         return "\n".join(lines)
+
+
+ class SupervisorAgent:
+     """
+     Supervisor Agent to orchestrate the agent pipeline in a serial, chained flow:
+
+     1. ChecklistCrossReferenceAgent
+     2. AssessmentRecommendationAgent
+     3. OutputFormatterAgent
+
+     Input:
+         submission_data: Pre-processed submission package data.
+     Output:
+         A final formatted report.
+     """
+     def __init__(self, checklist):
+         self.checklist_agent = ChecklistCrossReferenceAgent(checklist)
+         self.assessment_agent = AssessmentRecommendationAgent()
+         self.formatter_agent = OutputFormatterAgent()
+
+     def run(self, submission_data):
+         # Step 1: Cross-reference the submission data against the checklist.
+         cross_ref_result = self.checklist_agent.run(submission_data)
+         # Step 2: Analyze the cross-reference result to produce assessment and recommendations.
+         assessment_report = self.assessment_agent.run(cross_ref_result)
+         # Step 3: Format the assessment report for display.
+         formatted_report = self.formatter_agent.run(assessment_report)
+         return formatted_report
+
+
+ # --- Helper Functions for ZIP Processing ---
+
+ def process_uploaded_zip(uploaded_zip) -> list:
+     """
+     Processes an uploaded ZIP file, caches embeddings, and returns a list of file dictionaries.
+     """
+     submission_data = []
+
+     with ZipFile(uploaded_zip) as zip_ref:
+         for filename in zip_ref.namelist():
+             file_ext = os.path.splitext(filename)[1].lower()
+             file_bytes = zip_ref.read(filename)
+             content = ""
+
+             # Generate a unique cache key based on the file content
+             file_hash = hashlib.md5(file_bytes).hexdigest()
+             cache_key = f"{filename}_{file_hash}"
+             cache_file = f".cache/{cache_key}.pkl" # Cache file path
+
+             # Create the cache directory if it doesn't exist
+             os.makedirs(".cache", exist_ok=True)
+
+             if os.path.exists(cache_file):
+                 # Load from cache
+                 print(f"Loading {filename} from cache")
+                 try:
+                     with open(cache_file, "rb") as f:
+                         content = pickle.load(f)
+                 except Exception as e:
+                     st.error(f"Error loading {filename} from cache: {str(e)}")
+                     content = "" # Or handle the error as appropriate
+             else:
+                 # Process and cache
+                 print(f"Processing {filename} and caching")
+                 if file_ext == ".pdf":
+                     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                         tmp.write(file_bytes)
+                         tmp.flush()
+                         tmp_path = tmp.name
+                     file_size = os.path.getsize(tmp_path) / (1024 * 1024)
+                     workers = 2 if file_size > 2 else 1
+                     try:
+                         parser = LlamaParse(
+                             api_key=LLAMA_CLOUD_API_KEY,
+                             result_type="markdown",
+                             num_workers=workers,
+                             verbose=True
+                         )
+                         llama_documents = parser.load_data(tmp_path)
+                         content = "\n".join([doc.text for doc in llama_documents])
+                     except Exception as e:
+                         content = f"Error parsing PDF: {str(e)}"
+                         st.error(f"Error parsing PDF {filename}: {str(e)}")
+                     finally:
+                         os.remove(tmp_path)
+                 elif file_ext == ".txt":
+                     try:
+                         content = file_bytes.decode("utf-8")
+                     except UnicodeDecodeError:
+                         content = file_bytes.decode("latin1")
+                     except Exception as e:
+                         content = f"Error decoding text file {filename}: {str(e)}"
+                         st.error(f"Error decoding text file {filename}: {str(e)}")
+                 else:
+                     continue # Skip unsupported file types
+
+                 # Save to cache
+                 try:
+                     with open(cache_file, "wb") as f:
+                         pickle.dump(content, f)
+                 except Exception as e:
+                     st.error(f"Error saving {filename} to cache: {str(e)}")
+
+             submission_data.append({
+                 "filename": filename,
+                 "file_type": file_ext.replace(".", ""),
+                 "content": content,
+                 "metadata": {}
+             })
+     return submission_data
+
+
+ # --- Streamlit Interface ---
+
+ def main():
+     st.title("Submission Package Assessment")
+     st.write(
+         """
+         Upload a ZIP file containing your submission package.
+         The ZIP file can include PDF and text files.
+         """
+     )
+
+     uploaded_file = st.file_uploader("Choose a ZIP file", type=["zip"])
+
+     if uploaded_file is not None:
+         try:
+             # Process the uploaded ZIP file to extract submission data
+             submission_data = process_uploaded_zip(uploaded_file)
+             st.success("File processed successfully!")
+
+             # Display a summary of the extracted files
+             st.subheader("Extracted Files")
+             for file_info in submission_data:
+                 st.write(f"**{file_info['filename']}** - ({file_info['file_type'].upper()})")
+
+             # Instantiate and run the SupervisorAgent
+             supervisor = SupervisorAgent(IND_CHECKLIST)
+             assessment_report = supervisor.run(submission_data)
+
+             st.subheader("Assessment Report")
+             st.markdown(assessment_report)
+         except Exception as e:
+             st.error(f"Error processing file: {str(e)}")
+
+
+ if __name__ == "__main__":
+     # To run with Streamlit, use: streamlit run submission_assessment.py
+     main()
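The agent docstrings above define the `submission_data` contract: a list of dicts with `filename`, `file_type`, `content`, and `metadata` keys. A minimal sketch of driving the pipeline outside Streamlit with hand-built entries instead of a ZIP upload; it assumes `LLAMA_CLOUD_API_KEY` is set (the module checks it at import time), and the filename and content strings below are placeholders:

```python
# Minimal sketch: feed hand-built submission_data through the agent pipeline.
from submission_assessment import SupervisorAgent, IND_CHECKLIST

submission_data = [
    {
        "filename": "clinical_protocol.txt",  # matches the "protocol" file pattern
        "file_type": "txt",
        "content": "Study design, objectives, patient population, dosing regimen, endpoints ...",
        "metadata": {},
    },
    # No Investigator Brochure or Form FDA-1571 entries, so the report
    # should flag both as missing.
]

report = SupervisorAgent(IND_CHECKLIST).run(submission_data)
print(report)  # Markdown assessment produced by OutputFormatterAgent
```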
submission_assessment0.py ADDED
@@ -0,0 +1,324 @@
+ """
+ Submission Assessment Module
+
+ This module implements a LangGraph agentic pipeline to perform
+ cross-reference of an uploaded submission package (ZIP file) against a predefined
+ IND checklist. It supports processing of both PDF (using LlamaParse in the
+ pre-agent phase) and text files.
+
+ A Streamlit interface is provided to allow users to upload a ZIP file and view the assessment report.
+ """
+
+ import os
+ import io
+ import tempfile
+ from zipfile import ZipFile
+
+ import streamlit as st
+
+ # Import LlamaParse for PDF processing (assumes it's installed and configured)
+ from llama_parse import LlamaParse
+
+ # Note: These agent classes are implemented for demonstration.
+ # In a real-world scenario, you might integrate the official LangGraph agent APIs.
+
+ # Sample Checklist Configuration (this should be adjusted to your actual IND requirements)
+ IND_CHECKLIST = {
+     "Investigator Brochure": {
+         "file_patterns": ["brochure", "ib"],
+         "required_keywords": ["pharmacology", "toxicology", "clinical data"]
+     },
+     "Clinical Protocol": {
+         "file_patterns": ["clinical", "protocol"],
+         "required_keywords": ["study design", "objectives", "patient population", "dosing regimen", "endpoints"]
+     },
+     "Form FDA-1571": {
+         "file_patterns": ["1571", "fda-1571"],
+         "required_keywords": [
+             # Sponsor Information
+             "Name of Sponsor",
+             "Date of Submission",
+             "Address 1",
+             "Sponsor Telephone Number",
+             # Drug Information
+             "Name of Drug",
+             "IND Type",
+             "Proposed Indication for Use",
+             # Regulatory Information
+             "Phase of Clinical Investigation",
+             "Serial Number",
+             # Application Contents
+             "Table of Contents",
+             "Investigator's Brochure",
+             "Study protocol",
+             "Investigator data",
+             "Facilities data",
+             "Institutional Review Board data",
+             "Environmental assessment",
+             "Pharmacology and Toxicology",
+             # Signatures and Certifications
+             #"Person Responsible for Clinical Investigation Monitoring",
+             #"Person Responsible for Reviewing Safety Information",
+             "Sponsor or Sponsor's Authorized Representative First Name",
+             "Sponsor or Sponsor's Authorized Representative Last Name",
+             "Sponsor or Sponsor's Authorized Representative Title",
+             "Sponsor or Sponsor's Authorized Representative Telephone Number",
+             "Date of Sponsor's Signature"
+         ]
+     }
+ }
+
+
+ class ChecklistCrossReferenceAgent:
+     """
+     Agent that cross-references the pre-parsed submission package data
+     against a predefined IND checklist.
+
+     Input:
+         submission_data: list of dicts representing each file with keys:
+             - "filename": Filename of the document.
+             - "file_type": e.g., "pdf" or "txt"
+             - "content": Extracted text from the document.
+             - "metadata": (Optional) Additional metadata.
+         checklist: dict representing the IND checklist.
+     Output:
+         A mapping of checklist items to their verification status.
+     """
+     def __init__(self, checklist):
+         self.checklist = checklist
+
+     def run(self, submission_data):
+         cross_reference_result = {}
+         for document_name, config in self.checklist.items():
+             file_patterns = config.get("file_patterns", [])
+             required_keywords = config.get("required_keywords", [])
+             matched_file = None
+
+             # Attempt to find a matching file based on filename patterns.
+             for file_info in submission_data:
+                 filename = file_info.get("filename", "").lower()
+                 if any(pattern.lower() in filename for pattern in file_patterns):
+                     matched_file = file_info
+                     break
+
+             # Build the result per checklist item.
+             if not matched_file:
+                 # File is completely missing.
+                 cross_reference_result[document_name] = {
+                     "status": "missing",
+                     "missing_fields": required_keywords
+                 }
+             else:
+                 # File found, check if its content includes the required keywords.
+                 content = matched_file.get("content", "").lower()
+                 missing_fields = []
+                 for keyword in required_keywords:
+                     if keyword.lower() not in content:
+                         missing_fields.append(keyword)
+                 if missing_fields:
+                     cross_reference_result[document_name] = {
+                         "status": "incomplete",
+                         "missing_fields": missing_fields
+                     }
+                 else:
+                     cross_reference_result[document_name] = {
+                         "status": "present",
+                         "missing_fields": []
+                     }
+         return cross_reference_result
+
+
+ class AssessmentRecommendationAgent:
+     """
+     Agent that analyzes the cross-reference data and produces an
+     assessment report with recommendations.
+
+     Input:
+         cross_reference_result: dict mapping checklist items to their status.
+     Output:
+         A dict containing an overall compliance flag and detailed recommendations.
+     """
+     def run(self, cross_reference_result):
+         recommendations = {}
+         overall_compliant = True
+
+         for doc, result in cross_reference_result.items():
+             status = result.get("status")
+             if status == "missing":
+                 recommendations[doc] = f"{doc} is missing. Please include the document."
+                 overall_compliant = False
+             elif status == "incomplete":
+                 missing = ", ".join(result.get("missing_fields", []))
+                 recommendations[doc] = (f"{doc} is incomplete. Missing required fields: {missing}. "
+                                         "Please update accordingly.")
+                 overall_compliant = False
+             else:
+                 recommendations[doc] = f"{doc} is complete."
+         assessment = {
+             "overall_compliant": overall_compliant,
+             "recommendations": recommendations
+         }
+         return assessment
+
+
+ class OutputFormatterAgent:
+     """
+     Agent that formats the assessment report into a user-friendly format.
+     This example formats the output as Markdown.
+
+     Input:
+         assessment: dict output from AssessmentRecommendationAgent.
+     Output:
+         A formatted string report.
+     """
+     def run(self, assessment):
+         overall = "Compliant" if assessment.get("overall_compliant") else "Non-Compliant"
+         lines = []
+         lines.append("# Submission Package Assessment Report")
+         lines.append(f"**Overall Compliance:** {overall}\n")
+         recommendations = assessment.get("recommendations", {})
+         for doc, rec in recommendations.items():
+             lines.append(f"### {doc}")
+             # Format recommendations as bullet points
+             if "incomplete" in rec.lower():
+                 missing_fields = rec.split("Missing required fields: ")[1].split(".")[0].split(", ")
+                 lines.append("- Status: Incomplete")
+                 lines.append(" - Missing Fields:")
+                 for field in missing_fields:
+                     lines.append(f" - {field}")
+             else:
+                 lines.append(f"- Status: {rec}")
+         return "\n".join(lines)
+
+
+ class SupervisorAgent:
+     """
+     Supervisor Agent to orchestrate the agent pipeline in a serial, chained flow:
+
+     1. ChecklistCrossReferenceAgent
+     2. AssessmentRecommendationAgent
+     3. OutputFormatterAgent
+
+     Input:
+         submission_data: Pre-processed submission package data.
+     Output:
+         A final formatted report.
+     """
+     def __init__(self, checklist):
+         self.checklist_agent = ChecklistCrossReferenceAgent(checklist)
+         self.assessment_agent = AssessmentRecommendationAgent()
+         self.formatter_agent = OutputFormatterAgent()
+
+     def run(self, submission_data):
+         # Step 1: Cross-reference the submission data against the checklist.
+         cross_ref_result = self.checklist_agent.run(submission_data)
+         # Step 2: Analyze the cross-reference result to produce assessment and recommendations.
+         assessment_report = self.assessment_agent.run(cross_ref_result)
+         # Step 3: Format the assessment report for display.
+         formatted_report = self.formatter_agent.run(assessment_report)
+         return formatted_report
+
+
+ # --- Helper Functions for ZIP Processing ---
+
+ def process_uploaded_zip(uploaded_zip) -> list:
+     """
+     Processes an uploaded ZIP file (as BytesIO) and returns a list of file dictionaries.
+     Each dictionary contains:
+         - filename: name of the file.
+         - file_type: determined from the extension.
+         - content: extracted text content.
+         - metadata: additional metadata (currently empty).
+     For PDF files, uses LlamaParse for parsing.
+     For TXT files, reads the text directly.
+     """
+     submission_data = []
+
+     # Open the uploaded zip file from the BytesIO buffer.
+     with ZipFile(uploaded_zip) as zip_ref:
+         for filename in zip_ref.namelist():
+             file_ext = os.path.splitext(filename)[1].lower()
+             # Read file bytes
+             file_bytes = zip_ref.read(filename)
+             content = ""
+             if file_ext == ".pdf":
+                 # Create a temporary file for the PDF
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                     tmp.write(file_bytes)
+                     tmp.flush()
+                     tmp_path = tmp.name
+                 # Determine number of workers based on file size (in MB)
+                 file_size = os.path.getsize(tmp_path) / (1024 * 1024)
+                 workers = 2 if file_size > 2 else 1
+                 # Initialize LlamaParse and extract content
+                 parser = LlamaParse(
+                     api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
+                     result_type="markdown",
+                     num_workers=workers,
+                     verbose=True
+                 )
+                 try:
+                     # Load and parse the PDF file
+                     llama_documents = parser.load_data(tmp_path)
+                     # Aggregate text from parsed documents
+                     content = "\n".join([doc.text for doc in llama_documents])
+                 except Exception as e:
+                     content = f"Error parsing PDF: {str(e)}"
+                 finally:
+                     os.remove(tmp_path)
+             elif file_ext == ".txt":
+                 # Decode text content from bytes
+                 try:
+                     content = file_bytes.decode("utf-8")
+                 except UnicodeDecodeError:
+                     content = file_bytes.decode("latin1")
+             else:
+                 # Skip unsupported file types
+                 continue
+
+             submission_data.append({
+                 "filename": filename,
+                 "file_type": file_ext.replace(".", ""),
+                 "content": content,
+                 "metadata": {}
+             })
+     return submission_data
+
+
+ # --- Streamlit Interface ---
+
+ def main():
+     st.title("Submission Package Assessment")
+     st.write(
+         """
+         Upload a ZIP file containing your submission package.
+         The ZIP file can include PDF and text files.
+         """
+     )
+
+     uploaded_file = st.file_uploader("Choose a ZIP file", type=["zip"])
+
+     if uploaded_file is not None:
+         try:
+             # Process the uploaded ZIP file to extract submission data
+             submission_data = process_uploaded_zip(uploaded_file)
+             st.success("File processed successfully!")
+
+             # Display a summary of the extracted files
+             st.subheader("Extracted Files")
+             for file_info in submission_data:
+                 st.write(f"**{file_info['filename']}** - ({file_info['file_type'].upper()})")
+
+             # Instantiate and run the SupervisorAgent
+             supervisor = SupervisorAgent(IND_CHECKLIST)
+             assessment_report = supervisor.run(submission_data)
+
+             st.subheader("Assessment Report")
+             st.markdown(assessment_report)
+         except Exception as e:
+             st.error(f"Error processing file: {str(e)}")
+
+
+ if __name__ == "__main__":
+     # To run with Streamlit, use: streamlit run submission_assessment.py
+     main()
template.md ADDED
@@ -0,0 +1,72 @@
+ 1. Pre-IND Meeting Preparation
+ Request a Pre-IND Meeting: Schedule a meeting with the FDA to discuss your IND submission.
+
+ Prepare Meeting Package: Include proposed clinical trial design, preclinical data, manufacturing information, and any other relevant data.
+
+ Submit Questions: Prepare a list of specific questions for the FDA regarding your IND submission.
+
+ 2. Form FDA 1571
+ Complete Form FDA 1571: Ensure all sections are filled out accurately, including sponsor information, drug information, and clinical trial details.
+
+ Signature: Obtain the required signature from the sponsor or authorized representative.
+
+ 3. Table of Contents
+ Create a Comprehensive Table of Contents: Organize the IND submission with clear sections and page numbers for easy navigation.
+
+ 4. Introductory Statement and General Investigational Plan
+ Introductory Statement: Provide a brief overview of the drug, including its name, structure, and pharmacological class.
+
+ General Investigational Plan: Outline the clinical development plan, including the objectives and duration of the proposed studies.
+
+ 5. Investigator's Brochure
+ Compile the Investigator's Brochure: Include all relevant information about the drug, such as its formulation, pharmacology, toxicology, and clinical data.
+
+ Update as Necessary: Ensure the brochure is up-to-date with the latest data.
+
+ 6. Clinical Protocol
+ Develop Clinical Protocol: Detail the study design, including objectives, patient population, dosing regimen, and endpoints.
+
+ Inclusion/Exclusion Criteria: Clearly define the criteria for patient selection.
+
+ Safety Monitoring: Outline the procedures for monitoring patient safety.
+
+ 7. Chemistry, Manufacturing, and Control (CMC) Information
+ Drug Substance Information: Provide details on the drug substance, including its manufacture, characterization, and controls.
+
+ Drug Product Information: Include information on the drug product, such as formulation, manufacturing process, and specifications.
+
+ Stability Data: Submit stability data to support the proposed shelf life of the drug.
+
+ Labeling: Provide draft labeling for the investigational drug.
+
+ 8. Pharmacology and Toxicology Data
+ Pharmacology Studies: Submit data from in vitro and in vivo studies that demonstrate the drug's pharmacological effects.
+
+ Toxicology Studies: Include data from acute, subacute, and chronic toxicity studies, as well as reproductive and genotoxicity studies.
+
+ Safety Pharmacology: Provide data on the drug's effects on vital organ systems.
+
+ 9. Previous Human Experience
+ Summarize Previous Human Experience: If applicable, include data from previous clinical trials or use in humans.
+
+ Safety and Efficacy Data: Highlight any relevant safety and efficacy findings from prior studies.
+
+ 10. Additional Information
+ Environmental Assessment: Submit an environmental assessment or claim an exclusion if applicable.
+
+ Special Considerations: Include any additional information that may be relevant, such as data from pediatric studies or risk management plans.
+
+ 11. Review and Quality Control
+ Internal Review: Conduct a thorough internal review of the IND submission to ensure accuracy and completeness.
+
+ Quality Control: Verify that all data and documents meet regulatory standards and guidelines.
+
+ 12. Submission to FDA
+ Compile the IND Submission: Assemble all sections into a single, well-organized submission.
+
+ Submit to FDA: Send the IND submission to the appropriate FDA division via the required submission method (e.g., electronic submission).
+
+ Confirmation of Receipt: Obtain confirmation from the FDA that the IND has been received and is under review.
+
+
+