Spaces:
Building
Building
Dirk Haupt
committed on
Commit
·
5c73b25
1
Parent(s):
5ddd201
use only one tasklist
Browse files
app.py
CHANGED
@@ -71,10 +71,14 @@ def get_vectorstore(persist_dir: str = "vector_store"):
|
|
71 |
return _vectorstore
|
72 |
|
73 |
async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-trait", data_folder="data"):
|
|
|
|
|
|
|
|
|
|
|
74 |
msg = cl.Message(content="Loading documents from Hugging Face repository... please be patient...")
|
75 |
await msg.send()
|
76 |
|
77 |
-
# Create data directory if it doesn't exist
|
78 |
data_dir = Path("data")
|
79 |
data_dir.mkdir(exist_ok=True)
|
80 |
|
@@ -84,14 +88,12 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
|
|
84 |
dataset_pdf_files = [f for f in dataset_files if f.endswith('.pdf')]
|
85 |
|
86 |
# Download phase
|
87 |
-
|
88 |
-
|
89 |
-
await download_tasks.send()
|
90 |
|
91 |
for i, pdf_file in enumerate(dataset_pdf_files):
|
92 |
task = cl.Task(title=f"Downloading {pdf_file}")
|
93 |
-
await
|
94 |
-
await download_tasks.send()
|
95 |
|
96 |
hf_hub_download(
|
97 |
repo_id=repo_id,
|
@@ -102,22 +104,18 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
|
|
102 |
)
|
103 |
|
104 |
task.status = cl.TaskStatus.DONE
|
105 |
-
await
|
106 |
|
107 |
-
await download_tasks.remove() # Clear the download tasks before moving to next phase
|
108 |
-
|
109 |
# Loading phase
|
110 |
documents = []
|
111 |
pdf_files = [f for f in os.listdir(data_folder) if f.endswith('.pdf')]
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
await loading_tasks.send()
|
116 |
|
117 |
for i, filename in enumerate(pdf_files):
|
118 |
task = cl.Task(title=f"Loading {filename}")
|
119 |
-
await
|
120 |
-
await loading_tasks.send()
|
121 |
|
122 |
filepath = os.path.join(data_folder, filename)
|
123 |
if filename.endswith('.pdf'):
|
@@ -128,33 +126,30 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
|
|
128 |
loader = TextLoader(filepath)
|
129 |
documents.extend(loader.load())
|
130 |
task.status = cl.TaskStatus.DONE
|
131 |
-
await
|
132 |
-
|
133 |
-
await loading_tasks.remove() # Clear loading tasks before moving to next phase
|
134 |
|
135 |
# Split and process documents
|
136 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
137 |
chunks = text_splitter.split_documents(documents)
|
138 |
|
139 |
if chunks:
|
140 |
-
|
141 |
-
|
142 |
-
await processing_tasks.send()
|
143 |
|
144 |
batch_size = 100
|
145 |
num_batches = (len(chunks) + batch_size - 1) // batch_size
|
146 |
|
147 |
for i in range(0, len(chunks), batch_size):
|
148 |
task = cl.Task(title=f"Processing batch {(i//batch_size)+1}/{num_batches}")
|
149 |
-
await
|
150 |
-
await processing_tasks.send()
|
151 |
|
152 |
batch = chunks[i:i + batch_size]
|
153 |
vectorstore.add_documents(batch)
|
154 |
task.status = cl.TaskStatus.DONE
|
155 |
-
await
|
156 |
-
|
157 |
-
|
|
|
158 |
|
159 |
msg = cl.Message(content="✅ Documents loaded successfully!")
|
160 |
await msg.send()
|
|
|
71 |
return _vectorstore
|
72 |
|
73 |
async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-trait", data_folder="data"):
|
74 |
+
# Create a single TaskList for the entire process
|
75 |
+
tasks = cl.TaskList()
|
76 |
+
tasks.status = "Initializing..."
|
77 |
+
await tasks.send()
|
78 |
+
|
79 |
msg = cl.Message(content="Loading documents from Hugging Face repository... please be patient...")
|
80 |
await msg.send()
|
81 |
|
|
|
82 |
data_dir = Path("data")
|
83 |
data_dir.mkdir(exist_ok=True)
|
84 |
|
|
|
88 |
dataset_pdf_files = [f for f in dataset_files if f.endswith('.pdf')]
|
89 |
|
90 |
# Download phase
|
91 |
+
tasks.status = "Downloading files..."
|
92 |
+
await tasks.send()
|
|
|
93 |
|
94 |
for i, pdf_file in enumerate(dataset_pdf_files):
|
95 |
task = cl.Task(title=f"Downloading {pdf_file}")
|
96 |
+
await tasks.add_task(task)
|
|
|
97 |
|
98 |
hf_hub_download(
|
99 |
repo_id=repo_id,
|
|
|
104 |
)
|
105 |
|
106 |
task.status = cl.TaskStatus.DONE
|
107 |
+
await tasks.send()
|
108 |
|
|
|
|
|
109 |
# Loading phase
|
110 |
documents = []
|
111 |
pdf_files = [f for f in os.listdir(data_folder) if f.endswith('.pdf')]
|
112 |
|
113 |
+
tasks.status = "Loading files..."
|
114 |
+
await tasks.send()
|
|
|
115 |
|
116 |
for i, filename in enumerate(pdf_files):
|
117 |
task = cl.Task(title=f"Loading {filename}")
|
118 |
+
await tasks.add_task(task)
|
|
|
119 |
|
120 |
filepath = os.path.join(data_folder, filename)
|
121 |
if filename.endswith('.pdf'):
|
|
|
126 |
loader = TextLoader(filepath)
|
127 |
documents.extend(loader.load())
|
128 |
task.status = cl.TaskStatus.DONE
|
129 |
+
await tasks.send()
|
|
|
|
|
130 |
|
131 |
# Split and process documents
|
132 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
133 |
chunks = text_splitter.split_documents(documents)
|
134 |
|
135 |
if chunks:
|
136 |
+
tasks.status = "Processing chunks..."
|
137 |
+
await tasks.send()
|
|
|
138 |
|
139 |
batch_size = 100
|
140 |
num_batches = (len(chunks) + batch_size - 1) // batch_size
|
141 |
|
142 |
for i in range(0, len(chunks), batch_size):
|
143 |
task = cl.Task(title=f"Processing batch {(i//batch_size)+1}/{num_batches}")
|
144 |
+
await tasks.add_task(task)
|
|
|
145 |
|
146 |
batch = chunks[i:i + batch_size]
|
147 |
vectorstore.add_documents(batch)
|
148 |
task.status = cl.TaskStatus.DONE
|
149 |
+
await tasks.send()
|
150 |
+
|
151 |
+
tasks.status = "Completed"
|
152 |
+
await tasks.send()
|
153 |
|
154 |
msg = cl.Message(content="✅ Documents loaded successfully!")
|
155 |
await msg.send()
|