Dirk Haupt committed on
Commit
5c73b25
·
1 Parent(s): 5ddd201

use only one tasklist

Browse files
Files changed (1) hide show
  1. app.py +20 -25
app.py CHANGED
@@ -71,10 +71,14 @@ def get_vectorstore(persist_dir: str = "vector_store"):
71
  return _vectorstore
72
 
73
  async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-trait", data_folder="data"):
 
 
 
 
 
74
  msg = cl.Message(content="Loading documents from Hugging Face repository... please be patient...")
75
  await msg.send()
76
 
77
- # Create data directory if it doesn't exist
78
  data_dir = Path("data")
79
  data_dir.mkdir(exist_ok=True)
80
 
@@ -84,14 +88,12 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
84
  dataset_pdf_files = [f for f in dataset_files if f.endswith('.pdf')]
85
 
86
  # Download phase
87
- download_tasks = cl.TaskList()
88
- download_tasks.status = "Downloading files..."
89
- await download_tasks.send()
90
 
91
  for i, pdf_file in enumerate(dataset_pdf_files):
92
  task = cl.Task(title=f"Downloading {pdf_file}")
93
- await download_tasks.add_task(task)
94
- await download_tasks.send()
95
 
96
  hf_hub_download(
97
  repo_id=repo_id,
@@ -102,22 +104,18 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
102
  )
103
 
104
  task.status = cl.TaskStatus.DONE
105
- await download_tasks.send()
106
 
107
- await download_tasks.remove() # Clear the download tasks before moving to next phase
108
-
109
  # Loading phase
110
  documents = []
111
  pdf_files = [f for f in os.listdir(data_folder) if f.endswith('.pdf')]
112
 
113
- loading_tasks = cl.TaskList()
114
- loading_tasks.status = "Loading files..."
115
- await loading_tasks.send()
116
 
117
  for i, filename in enumerate(pdf_files):
118
  task = cl.Task(title=f"Loading {filename}")
119
- await loading_tasks.add_task(task)
120
- await loading_tasks.send()
121
 
122
  filepath = os.path.join(data_folder, filename)
123
  if filename.endswith('.pdf'):
@@ -128,33 +126,30 @@ async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-
128
  loader = TextLoader(filepath)
129
  documents.extend(loader.load())
130
  task.status = cl.TaskStatus.DONE
131
- await loading_tasks.send()
132
-
133
- await loading_tasks.remove() # Clear loading tasks before moving to next phase
134
 
135
  # Split and process documents
136
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
137
  chunks = text_splitter.split_documents(documents)
138
 
139
  if chunks:
140
- processing_tasks = cl.TaskList()
141
- processing_tasks.status = "Processing chunks..."
142
- await processing_tasks.send()
143
 
144
  batch_size = 100
145
  num_batches = (len(chunks) + batch_size - 1) // batch_size
146
 
147
  for i in range(0, len(chunks), batch_size):
148
  task = cl.Task(title=f"Processing batch {(i//batch_size)+1}/{num_batches}")
149
- await processing_tasks.add_task(task)
150
- await processing_tasks.send()
151
 
152
  batch = chunks[i:i + batch_size]
153
  vectorstore.add_documents(batch)
154
  task.status = cl.TaskStatus.DONE
155
- await processing_tasks.send()
156
-
157
- await processing_tasks.remove() # Clear processing tasks when done
 
158
 
159
  msg = cl.Message(content="✅ Documents loaded successfully!")
160
  await msg.send()
 
71
  return _vectorstore
72
 
73
  async def process_and_load_documents(vectorstore, repo_id="Frikster42/name-that-trait", data_folder="data"):
74
+ # Create a single TaskList for the entire process
75
+ tasks = cl.TaskList()
76
+ tasks.status = "Initializing..."
77
+ await tasks.send()
78
+
79
  msg = cl.Message(content="Loading documents from Hugging Face repository... please be patient...")
80
  await msg.send()
81
 
 
82
  data_dir = Path("data")
83
  data_dir.mkdir(exist_ok=True)
84
 
 
88
  dataset_pdf_files = [f for f in dataset_files if f.endswith('.pdf')]
89
 
90
  # Download phase
91
+ tasks.status = "Downloading files..."
92
+ await tasks.send()
 
93
 
94
  for i, pdf_file in enumerate(dataset_pdf_files):
95
  task = cl.Task(title=f"Downloading {pdf_file}")
96
+ await tasks.add_task(task)
 
97
 
98
  hf_hub_download(
99
  repo_id=repo_id,
 
104
  )
105
 
106
  task.status = cl.TaskStatus.DONE
107
+ await tasks.send()
108
 
 
 
109
  # Loading phase
110
  documents = []
111
  pdf_files = [f for f in os.listdir(data_folder) if f.endswith('.pdf')]
112
 
113
+ tasks.status = "Loading files..."
114
+ await tasks.send()
 
115
 
116
  for i, filename in enumerate(pdf_files):
117
  task = cl.Task(title=f"Loading {filename}")
118
+ await tasks.add_task(task)
 
119
 
120
  filepath = os.path.join(data_folder, filename)
121
  if filename.endswith('.pdf'):
 
126
  loader = TextLoader(filepath)
127
  documents.extend(loader.load())
128
  task.status = cl.TaskStatus.DONE
129
+ await tasks.send()
 
 
130
 
131
  # Split and process documents
132
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
133
  chunks = text_splitter.split_documents(documents)
134
 
135
  if chunks:
136
+ tasks.status = "Processing chunks..."
137
+ await tasks.send()
 
138
 
139
  batch_size = 100
140
  num_batches = (len(chunks) + batch_size - 1) // batch_size
141
 
142
  for i in range(0, len(chunks), batch_size):
143
  task = cl.Task(title=f"Processing batch {(i//batch_size)+1}/{num_batches}")
144
+ await tasks.add_task(task)
 
145
 
146
  batch = chunks[i:i + batch_size]
147
  vectorstore.add_documents(batch)
148
  task.status = cl.TaskStatus.DONE
149
+ await tasks.send()
150
+
151
+ tasks.status = "Completed"
152
+ await tasks.send()
153
 
154
  msg = cl.Message(content="✅ Documents loaded successfully!")
155
  await msg.send()