Spaces:

Ankitajadhav
/

Whats_Cooking

Runtime error

App Files Files Community

Ankitajadhav committed on Jul 6, 2024

Commit

5ecd97e

verified ·

1 Parent(s): 94005ba

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -21

app.py CHANGED Viewed

@@ -19,33 +19,45 @@ class VectorStore:
         self.collection = self.chroma_client.create_collection(name=collection_name)
     # Method to populate the vector store with embeddings from a dataset
-    def populate_vectors(self, dataset):
-        # Select the text columns to concatenate (first 2500 rows of the 'train' split)
-        title = dataset['train']['title_cleaned'][:2500]  # Limiting to the first 2500 examples for the demo
-        recipe = dataset['train']['recipe_new'][:2500]
-        meal_type = dataset['train']['meal_type'][:2500]
-        allergy = dataset['train']['allergy_type'][:2500]
-        ingredients_alternative = dataset['train']['ingredients_alternatives'][:2500]
-        # Concatenate the text from all five columns, row by row
-        texts = [f"{tit} {rep} {meal} {alle} {ingr} " for tit, rep, meal,alle, ingr in zip(title,recipe,meal_type,allergy,ingredients_alternative)]
-        for i, item in enumerate(texts):
-            embeddings = self.embedding_model.encode(item).tolist()
-            self.collection.add(embeddings=[embeddings], documents=[item], ids=[str(i)])
-    # Method to search the ChromaDB collection for relevant context based on a query
     def search_context(self, query, n_results=1):
         # Encode the query with self.embedding_model (the model used when
         # populating the collection) and return the collection's query result
         # limited to n_results matches.
         query_embeddings = self.embedding_model.encode(query).tolist()
         return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)
-# importing dataset hosted on huggingface
-# dataset details - https://huggingface.co/datasets/Thefoodprocessor/recipe_new_with_features_full
-dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full')
 # Create the vector store (backed by the "embedding_vector" collection)
 vector_store = VectorStore("embedding_vector")
-vector_store.populate_vectors(dataset)
 # Load the model and tokenizer

         self.collection = self.chroma_client.create_collection(name=collection_name)
     # Method to populate the vector store with embeddings from a dataset
+    def populate_vectors(self, dataset, batch_size=100):
+        """Stream the recipe dataset and index it in the collection in batches.
+
+        Args:
+            dataset: Ignored. Kept only for call-site compatibility; the
+                'train' split is always re-loaded here in streaming mode.
+            batch_size: Number of recipes embedded and stored per batch.
+                Must be at least 1.
+
+        Raises:
+            ValueError: If batch_size is smaller than 1.
+        """
+        if batch_size < 1:
+            raise ValueError("batch_size must be at least 1")
+        # Use dataset streaming so examples are consumed one at a time
+        dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train', streaming=True)
+        texts = []
+        # Dataset index of the first example in the pending batch.
+        # _process_batch builds ids as batch_start + position, so passing the
+        # true start (not the index of the batch's last example) gives every
+        # example its own dataset index as id and prevents the final partial
+        # batch from reusing ids already written by the previous batch.
+        batch_start = 0
+        for example in dataset:
+            title = example['title_cleaned']
+            recipe = example['recipe_new']
+            meal_type = example['meal_type']
+            allergy = example['allergy_type']
+            ingredients_alternative = example['ingredients_alternatives']
+            # Concatenate the text from the columns
+            texts.append(f"{title} {recipe} {meal_type} {allergy} {ingredients_alternative}")
+            # Flush once a full batch has been collected
+            if len(texts) == batch_size:
+                self._process_batch(texts, batch_start)
+                batch_start += len(texts)
+                texts = []
+        # Flush the final, partially filled batch
+        if texts:
+            self._process_batch(texts, batch_start)
+    def _process_batch(self, texts, batch_start_idx):
+        """Embed a batch of texts and store them in the collection.
+
+        Args:
+            texts: List of documents to embed; an empty list is a no-op.
+            batch_start_idx: Offset added to each text's position within the
+                batch to form its string id.
+        """
+        # Nothing to embed; also avoids calling encode with batch_size=0.
+        if not texts:
+            return
+        embeddings = self.embedding_model.encode(texts, batch_size=len(texts)).tolist()
+        ids = [str(batch_start_idx + offset) for offset in range(len(texts))]
+        # One add call for the whole batch instead of one call per document.
+        self.collection.add(embeddings=embeddings, documents=list(texts), ids=ids)
     def search_context(self, query, n_results=1):
         # Encode the query with self.embedding_model (the model used when
         # populating the collection) and return the collection's query result
         # limited to n_results matches.
         query_embeddings = self.embedding_model.encode(query).tolist()
         return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)
 # Create the vector store (backed by the "embedding_vector" collection)
 vector_store = VectorStore("embedding_vector")
+vector_store.populate_vectors(dataset=None)
 # Load the model and tokenizer