drwlf committed on
Commit 01bc500 · 1 Parent(s): 5a347d8

Add medical PDF ingestion Gradio app with RAG capabilities

Files changed (4)
  1. README.md +32 -10
  2. app.py +99 -0
  3. ingest.py +477 -0
  4. requirements.txt +11 -0
README.md CHANGED
@@ -1,12 +1,34 @@
- ---
- title: Medical Pdf Ingestion
- emoji: 📊
- colorFrom: blue
- colorTo: purple
- sdk: gradio
- sdk_version: 5.39.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PDF Ingest and Query System
+
+ This Gradio Space provides a PDF ingestion and querying interface for building a searchable document library.
+
+ ## Features
+
+ - **PDF Upload & Ingestion**: Upload PDF files and extract text and images using unstructured.io
+ - **Intelligent Chunking**: Automatically chunks documents for optimal retrieval
+ - **Vector Embeddings**: Uses the BAAI/bge-m3 model for high-quality text embeddings
+ - **Image Processing**: Extracts and embeds images using CLIP models
+ - **Deduplication**: Prevents duplicate ingestion of the same files
+ - **Semantic Search**: Query your document library using natural language
+
+ ## Usage
+
+ 1. **Upload PDFs**: Use the file upload interface to add PDF documents to your library
+ 2. **Ingest Documents**: Click "Ingest PDFs" to process and add them to the vector database
+ 3. **Query Library**: Use natural-language queries to search through your ingested documents
+
+ ## Technical Details
+
+ - **Vector Database**: ChromaDB for efficient similarity search
+ - **Text Embeddings**: BAAI/bge-m3 (1024-dimensional)
+ - **Image Embeddings**: CLIP ViT-B/32 (512-dimensional)
+ - **PDF Processing**: unstructured.io for robust document parsing
+ - **UI Framework**: Gradio for the interactive web interface
+
+ ## Requirements
+
+ This Space requires significant computational resources for embedding generation and may take time to process large documents.
+
  ---

+ Built with ❤️ using Hugging Face Transformers, ChromaDB, and Gradio.
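
The query side described in the README is not implemented in this commit's app.py. As a rough sketch of how the ingested library could be searched, the snippet below uses the database path, collection name, and metadata keys defined in ingest.py further down; the query string and the metadata filter (including the filename) are purely illustrative.

```python
from sentence_transformers import SentenceTransformer
import chromadb

# Path and collection name mirror DB_PATH and COLLECTION_NAME in ingest.py.
client = chromadb.PersistentClient(path="/home/tony/chromadb")
collection = client.get_collection(name="medical_library")

# Embed the query with the same text model used at ingestion time (BAAI/bge-m3).
text_model = SentenceTransformer("BAAI/bge-m3")
query = "first-line treatment for community-acquired pneumonia"  # illustrative query
query_embedding = text_model.encode(query).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    where={"source_file": "example.pdf"},  # optional metadata filter; filename is hypothetical
)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["source_file"], "p.", meta["page_number"], "-", doc[:120])
```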
app.py ADDED
@@ -0,0 +1,99 @@
+ import gradio as gr
+ import os
+ import subprocess
+ import shutil
+ from pathlib import Path
+ import time
+
+ # Function to handle file upload and ingestion
+ def upload_and_ingest(uploaded_file):
+     if uploaded_file is None:
+         return "No file uploaded."
+
+     try:
+         # Create the pdf_docs directory if it doesn't exist
+         pdf_docs_dir = "/home/tony/pdf_docs"
+         os.makedirs(pdf_docs_dir, exist_ok=True)
+
+         # Copy the uploaded file (a path string when type="filepath") into pdf_docs
+         filename = os.path.basename(uploaded_file)
+         file_path = os.path.join(pdf_docs_dir, filename)
+         shutil.copy2(uploaded_file, file_path)
+
+         # Run the ingestion script and capture output
+         result = subprocess.run(
+             ["python", "/home/tony/ingest.py"],
+             cwd="/home/tony",
+             capture_output=True,
+             text=True
+         )
+
+         if result.returncode == 0:
+             return f"✅ File '{filename}' uploaded and ingested successfully!\n\nIngestion Output:\n{result.stdout}"
+         else:
+             return f"❌ Error during ingestion:\n{result.stderr}\n\nStdout:\n{result.stdout}"
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ # Function to handle Google Drive folder link (placeholder for now)
+ def link_gdrive_folder(folder_link):
+     if not folder_link or not folder_link.strip():
+         return "Please provide a Google Drive folder link."
+
+     # TODO: Implement Google Drive integration
+     return f"🚧 Google Drive integration coming soon!\nFolder link: {folder_link}"
+
+ # Create Gradio Interface
+ with gr.Blocks(title="PDF Ingestion Tool", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 📚 PDF Ingestion Tool")
+     gr.Markdown("Upload PDF files or link Google Drive folders to ingest into the medical knowledge base.")
+
+     with gr.Tab("File Upload"):
+         with gr.Row():
+             file_input = gr.File(
+                 label="Upload PDF File",
+                 file_types=[".pdf"],
+                 type="filepath"
+             )
+
+         upload_btn = gr.Button("Upload & Ingest", variant="primary")
+         upload_output = gr.Textbox(
+             label="Ingestion Status",
+             lines=10,
+             max_lines=20,
+             show_copy_button=True
+         )
+
+         upload_btn.click(
+             fn=upload_and_ingest,
+             inputs=[file_input],
+             outputs=[upload_output],
+             show_progress=True
+         )
+
+     with gr.Tab("Google Drive"):
+         with gr.Row():
+             gdrive_input = gr.Textbox(
+                 label="Google Drive Folder Link",
+                 placeholder="https://drive.google.com/drive/folders/...",
+                 lines=1
+             )
+
+         gdrive_btn = gr.Button("Link & Ingest", variant="primary")
+         gdrive_output = gr.Textbox(
+             label="Status",
+             lines=10,
+             max_lines=20,
+             show_copy_button=True
+         )
+
+         gdrive_btn.click(
+             fn=link_gdrive_folder,
+             inputs=[gdrive_input],
+             outputs=[gdrive_output],
+             show_progress=True
+         )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
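
Note that app.py shells out to ingest.py with subprocess.run, so both embedding models are reloaded on every upload. A possible in-process alternative, not part of this commit, is to import the module and call its entry point directly; this is only a sketch and assumes ingest.py sits next to app.py on the import path.

```python
import ingest  # assumes ingest.py is importable from the app's working directory

def run_ingestion_inline() -> str:
    """Run the ingestion pipeline in-process instead of spawning a Python subprocess."""
    ingest.main()  # same entry point the subprocess currently invokes
    return "Ingestion finished; see the application logs for details."
```

The trade-off is that model loading and PDF parsing then run inside the Gradio worker process, which keeps the models warm across uploads but ties up the worker for the duration of ingestion.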
ingest.py ADDED
@@ -0,0 +1,477 @@
+ #!/usr/bin/env python3
+ """
+ PDF Document Ingestion Script
+
+ This script processes complex PDF documents (like medical textbooks), extracts text and images,
+ chunks them intelligently, generates vector embeddings using state-of-the-art local models,
+ and stores them in a local ChromaDB vector database.
+
+ Author: Expert Python Developer
+ Python Version: 3.9+
+ """
+
+ import os
+ import uuid
+ import hashlib
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional, Tuple
+ import logging
+
+ # Third-party imports
+ from tqdm import tqdm
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+ from chromadb.config import Settings
+ from unstructured.partition.pdf import partition_pdf
+ from PIL import Image
+ import io
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # =============================================================================
+ # CONFIGURATION SECTION
+ # =============================================================================
+
+ # Input/Output Paths
+ SOURCE_DIRECTORY = "/home/tony/pdf_docs"  # Directory containing PDF files to process
+ DB_PATH = "/home/tony/chromadb"  # Path for persistent ChromaDB database
+ IMAGE_OUTPUT_DIRECTORY = "/home/tony/extracted_images"  # Path for storing extracted images
+
+ # Model Configuration
+ TEXT_EMBEDDING_MODEL = "BAAI/bge-m3"  # State-of-the-art text embedding model
+ IMAGE_EMBEDDING_MODEL = "clip-ViT-B-32"  # CLIP model for image embeddings
+
+ # Database Configuration
+ COLLECTION_NAME = "medical_library"  # ChromaDB collection name
+
+ # Processing Configuration
+ BATCH_SIZE = 100  # Number of chunks to process in each batch
+ MAX_CHUNK_SIZE = 1000  # Maximum characters per text chunk
+
+ # =============================================================================
+ # INITIALIZATION FUNCTIONS
+ # =============================================================================
+
+ def initialize_chromadb() -> Tuple[chromadb.Client, chromadb.Collection]:
+     """
+     Initialize and return the ChromaDB client and collection.
+
+     Returns:
+         Tuple[chromadb.Client, chromadb.Collection]: The client and collection objects
+     """
+     try:
+         # Ensure database directory exists
+         os.makedirs(DB_PATH, exist_ok=True)
+
+         # Initialize ChromaDB client with persistent storage
+         client = chromadb.PersistentClient(
+             path=DB_PATH,
+             settings=Settings(
+                 anonymized_telemetry=False,
+                 allow_reset=True
+             )
+         )
+
+         # Get or create collection
+         try:
+             collection = client.get_collection(name=COLLECTION_NAME)
+             logger.info(f"Using existing collection: {COLLECTION_NAME}")
+         except chromadb.errors.NotFoundError:
+             collection = client.create_collection(
+                 name=COLLECTION_NAME,
+                 metadata={"description": "Medical textbook PDF content with embeddings"}
+             )
+             logger.info(f"Created new collection: {COLLECTION_NAME}")
+
+         return client, collection
+
+     except Exception as e:
+         logger.error(f"Failed to initialize ChromaDB: {e}")
+         raise
+
+
+ def initialize_models() -> Tuple[SentenceTransformer, SentenceTransformer]:
+     """
+     Load and return the text and image embedding models.
+
+     Returns:
+         Tuple[SentenceTransformer, SentenceTransformer]: Text and image models
+     """
+     try:
+         logger.info("Loading text embedding model...")
+         text_model = SentenceTransformer(TEXT_EMBEDDING_MODEL)
+
+         logger.info("Loading image embedding model...")
+         image_model = SentenceTransformer(IMAGE_EMBEDDING_MODEL)
+
+         logger.info("Models loaded successfully!")
+         return text_model, image_model
+
+     except Exception as e:
+         logger.error(f"Failed to load models: {e}")
+         raise
+
+
+ def ensure_directories() -> None:
+     """
+     Ensure all required directories exist.
+     """
+     try:
+         os.makedirs(SOURCE_DIRECTORY, exist_ok=True)
+         os.makedirs(IMAGE_OUTPUT_DIRECTORY, exist_ok=True)
+         os.makedirs(DB_PATH, exist_ok=True)
+         logger.info("All directories verified/created successfully")
+
+     except Exception as e:
+         logger.error(f"Failed to create directories: {e}")
+         raise
+
+
+ # =============================================================================
+ # DEDUPLICATION FUNCTIONS
+ # =============================================================================
+
+ def calculate_file_hash(file_path: str) -> str:
+     """
+     Calculate SHA-256 hash of a file for deduplication.
+
+     Args:
+         file_path (str): Path to the file
+
+     Returns:
+         str: SHA-256 hash of the file
+     """
+     hash_sha256 = hashlib.sha256()
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_sha256.update(chunk)
+     return hash_sha256.hexdigest()
+
+
+ def is_pdf_already_processed(pdf_path: str, collection: chromadb.Collection) -> bool:
+     """
+     Check if a PDF has already been processed by checking its hash in the database.
+
+     Args:
+         pdf_path (str): Path to the PDF file
+         collection (chromadb.Collection): ChromaDB collection
+
+     Returns:
+         bool: True if already processed, False otherwise
+     """
+     try:
+         file_hash = calculate_file_hash(pdf_path)
+
+         # Query the collection for any document with this file hash
+         result = collection.get(where={"file_hash": file_hash}, limit=1)
+         if len(result['ids']) > 0:
+             pdf_filename = Path(pdf_path).name
+             logger.info(f"PDF {pdf_filename} already processed (hash: {file_hash[:12]}...). Skipping.")
+             return True
+         return False
+     except Exception as e:
+         logger.warning(f"Error checking if PDF is already processed: {e}")
+         return False
+
+
+ # =============================================================================
+ # DOCUMENT PROCESSING FUNCTIONS
+ # =============================================================================
+
+ def process_pdf(
+     pdf_path: str,
+     text_model: SentenceTransformer,
+     image_model: SentenceTransformer,
+     collection: chromadb.Collection
+ ) -> None:
+     """
+     Process a single PDF file and store chunks in ChromaDB.
+
+     Args:
+         pdf_path (str): Path to the PDF file
+         text_model (SentenceTransformer): Text embedding model
+         image_model (SentenceTransformer): Image embedding model
+         collection (chromadb.Collection): ChromaDB collection
+     """
+     try:
+         pdf_filename = Path(pdf_path).name
+         logger.info(f"Processing PDF: {pdf_filename}")
+
+         # Calculate file hash for deduplication
+         file_hash = calculate_file_hash(pdf_path)
+
+         # Parse PDF with unstructured
+         elements = partition_pdf(
+             filename=pdf_path,
+             strategy="hi_res",
+             extract_images_in_pdf=True,
+             infer_table_structure=True
+         )
+
+         if not elements:
+             logger.warning(f"No elements extracted from {pdf_filename}")
+             return
+
+         # Generate chunks from elements
+         chunks = create_chunks_from_elements(elements, pdf_filename, file_hash)
+
+         if not chunks:
+             logger.warning(f"No chunks created from {pdf_filename}")
+             return
+
+         # Process chunks in batches
+         process_chunks_in_batches(chunks, text_model, image_model, collection)
+
+         logger.info(f"Successfully processed {pdf_filename}: {len(chunks)} chunks")
+
+     except Exception as e:
+         logger.error(f"Error processing PDF {pdf_path}: {e}")
+         raise
+
+
+ def create_chunks_from_elements(elements: List, pdf_filename: str, file_hash: str) -> List[Dict[str, Any]]:
+     """
+     Create chunks from unstructured elements (let unstructured handle the intelligent parsing).
+
+     Args:
+         elements (List): List of unstructured elements
+         pdf_filename (str): Name of the source PDF file
+         file_hash (str): SHA-256 hash of the PDF file for deduplication
+
+     Returns:
+         List[Dict[str, Any]]: List of chunk dictionaries
+     """
+     chunks = []
+
+     for i, element in enumerate(elements):
+         try:
+             element_type = element.category
+             page_number = getattr(element.metadata, 'page_number', 1)
+
+             # Handle image elements
+             if element_type == "Image" and hasattr(element, 'image_bytes'):
+                 # Save image and create image chunk
+                 image_path = save_image(element.image_bytes, pdf_filename, i)
+                 if image_path:
+                     chunks.append({
+                         'id': f"{pdf_filename}_img_{i}",
+                         'content': image_path,
+                         'type': 'image',
+                         'metadata': {
+                             'source_file': pdf_filename,
+                             'page_number': page_number,
+                             'element_type': element_type,
+                             'image_path': image_path,
+                             'file_hash': file_hash
+                         }
+                     })
+
+             # Handle all text elements as individual chunks (unstructured already did the intelligent parsing)
+             else:
+                 text_content = str(element).strip()
+                 if text_content and len(text_content) > 20:  # Skip very short fragments
+                     chunks.append({
+                         'id': f"{pdf_filename}_text_{i}",
+                         'content': text_content,
+                         'type': 'text',
+                         'metadata': {
+                             'source_file': pdf_filename,
+                             'page_number': page_number,
+                             'element_type': element_type,
+                             'file_hash': file_hash
+                         }
+                     })
+
+         except Exception as e:
+             logger.warning(f"Error processing element {i}: {e}")
+             continue
+
+     return chunks
+
+
+ def save_image(image_bytes: bytes, pdf_filename: str, chunk_index: int) -> Optional[str]:
+     """
+     Save image bytes to file and return the path.
+
+     Args:
+         image_bytes (bytes): Raw image data
+         pdf_filename (str): Source PDF filename
+         chunk_index (int): Index of the chunk
+
+     Returns:
+         Optional[str]: Path to saved image or None if failed
+     """
+     try:
+         # Create unique filename
+         image_filename = f"{Path(pdf_filename).stem}_{chunk_index}_{uuid.uuid4().hex[:8]}.png"
+         image_path = os.path.join(IMAGE_OUTPUT_DIRECTORY, image_filename)
+
+         # Convert and save image
+         image = Image.open(io.BytesIO(image_bytes))
+         image.save(image_path, format='PNG')
+
+         return image_path
+
+     except Exception as e:
+         logger.warning(f"Failed to save image: {e}")
+         return None
+
+
+ def process_chunks_in_batches(
+     chunks: List[Dict[str, Any]],
+     text_model: SentenceTransformer,
+     image_model: SentenceTransformer,
+     collection: chromadb.Collection
+ ) -> None:
+     """
+     Process chunks in batches and store in ChromaDB.
+
+     Args:
+         chunks (List[Dict[str, Any]]): List of chunks to process
+         text_model (SentenceTransformer): Text embedding model
+         image_model (SentenceTransformer): Image embedding model
+         collection (chromadb.Collection): ChromaDB collection
+     """
+     for i in range(0, len(chunks), BATCH_SIZE):
+         batch = chunks[i:i + BATCH_SIZE]
+
+         try:
+             process_batch(batch, text_model, image_model, collection)
+         except Exception as e:
+             logger.error(f"Error processing batch {i//BATCH_SIZE + 1}: {e}")
+             # Continue with next batch instead of failing completely
+             continue
+
+
+ def process_batch(
+     batch: List[Dict[str, Any]],
+     text_model: SentenceTransformer,
+     image_model: SentenceTransformer,
+     collection: chromadb.Collection
+ ) -> None:
+     """
+     Process a single batch of chunks.
+
+     Args:
+         batch (List[Dict[str, Any]]): Batch of chunks to process
+         text_model (SentenceTransformer): Text embedding model
+         image_model (SentenceTransformer): Image embedding model
+         collection (chromadb.Collection): ChromaDB collection
+     """
+     ids = []
+     embeddings = []
+     metadatas = []
+     documents = []
+
+     for chunk in batch:
+         try:
+             chunk_id = chunk['id']
+             content = chunk['content']
+             chunk_type = chunk['type']
+             metadata = chunk['metadata']
+
+             # Generate embedding based on type
+             if chunk_type == 'text':
+                 embedding = text_model.encode(content).tolist()
+                 document = content
+             elif chunk_type == 'image':
+                 # For images, encode the image file
+                 if os.path.exists(content):
+                     embedding = image_model.encode(Image.open(content)).tolist()
+                     document = f"Image from {metadata['source_file']} page {metadata['page_number']}"
+                 else:
+                     logger.warning(f"Image file not found: {content}")
+                     continue
+             else:
+                 logger.warning(f"Unknown chunk type: {chunk_type}")
+                 continue
+
+             ids.append(chunk_id)
+             embeddings.append(embedding)
+             metadatas.append(metadata)
+             documents.append(document)
+
+         except Exception as e:
+             logger.warning(f"Error processing chunk {chunk.get('id', 'unknown')}: {e}")
+             continue
+
+     # Add batch to collection
+     if ids:
+         try:
+             collection.add(
+                 ids=ids,
+                 embeddings=embeddings,
+                 metadatas=metadatas,
+                 documents=documents
+             )
+             logger.debug(f"Added batch of {len(ids)} chunks to database")
+         except Exception as e:
+             logger.error(f"Error adding batch to database: {e}")
+             raise
+
+
+ # =============================================================================
+ # MAIN EXECUTION
+ # =============================================================================
+
+ def main():
+     """
+     Main execution function.
+     """
+     try:
+         logger.info("Starting PDF ingestion process...")
+
+         # Ensure directories exist
+         ensure_directories()
+
+         # Initialize models and database
+         logger.info("Initializing models and database...")
+         text_model, image_model = initialize_models()
+         client, collection = initialize_chromadb()
+
+         # Get list of PDF files
+         pdf_files = []
+         if os.path.exists(SOURCE_DIRECTORY):
+             pdf_files = [f for f in os.listdir(SOURCE_DIRECTORY) if f.lower().endswith('.pdf')]
+
+         if not pdf_files:
+             logger.warning(f"No PDF files found in {SOURCE_DIRECTORY}")
+             logger.info("Please add PDF files to the source directory and run again.")
+             return
+
+         logger.info(f"Found {len(pdf_files)} PDF files to process")
+
+         # Process each PDF file with progress bar
+         with tqdm(pdf_files, desc="Processing PDFs") as pbar:
+             for pdf_file in pbar:
+                 pdf_path = os.path.join(SOURCE_DIRECTORY, pdf_file)
+                 pbar.set_description(f"Processing {pdf_file}")
+
+                 # Check if this PDF has already been processed
+                 if is_pdf_already_processed(pdf_path, collection):
+                     continue
+
+                 try:
+                     process_pdf(pdf_path, text_model, image_model, collection)
+                 except Exception as e:
+                     logger.error(f"Failed to process {pdf_file}: {e}")
+                     continue
+
+         # Get final statistics
+         try:
+             count = collection.count()
+             logger.info(f"Ingestion complete! Total chunks in database: {count}")
+         except Exception as e:
+             logger.warning(f"Could not get final count: {e}")
+
+         logger.info("PDF ingestion process completed successfully!")
+
+     except Exception as e:
+         logger.error(f"Fatal error in main execution: {e}")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
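
The main() routine rescans the entire source directory on every run and relies on the hash check to skip already-ingested files. If only one newly uploaded PDF needs processing, the helpers above can be reused directly; a minimal sketch (the PDF path is a placeholder), assuming ingest.py is importable:

```python
# Minimal single-file run reusing the helpers defined in ingest.py.
from ingest import ensure_directories, initialize_models, initialize_chromadb, process_pdf

ensure_directories()
text_model, image_model = initialize_models()
client, collection = initialize_chromadb()

# Placeholder path; any PDF under SOURCE_DIRECTORY works the same way.
process_pdf("/home/tony/pdf_docs/example.pdf", text_model, image_model, collection)
```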
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==5.39.0
+ transformers==4.49.1
+ torch>=2.0.0
+ chromadb==0.5.2
+ sentence-transformers==3.4.0
+ unstructured[all-docs]==0.18.5
+ pillow>=10.0.0
+ numpy>=1.24.0
+ pandas>=2.0.0
+ tqdm>=4.65.0
+ clip-by-openai