Spaces:
Sleeping
Sleeping
from typing import Sequence, List, Tuple | |
from models.vectorizer import Vectorizer | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
class PromptSearchEngine:
    """Nearest-neighbour search over text prompts.

    Prompts are embedded with a ``SentenceTransformer`` model and stored in a
    flat (exhaustive) L2 FAISS index. ``prompts_track`` keeps the original
    strings in insertion order so that a position returned by the index can
    be mapped back to its prompt.
    """

    def __init__(self, model_name='bert-base-nli-mean-tokens'):
        """Load the embedding model and create an empty index.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # The FAISS index has to be created with the model's embedding width.
        self.embedding_dimension = self.model.get_sentence_embedding_dimension()
        # Euclidean-distance index: brute force, adequate for small datasets.
        self.index = faiss.IndexFlatL2(self.embedding_dimension)
        # Original prompts; entry i belongs to the i-th vector added to the index.
        self.prompts_track = []

    def add_prompts_to_vector_database(self, prompts):
        """Embed ``prompts`` and append them to the index.

        Args:
            prompts: An iterable of prompt strings. A single string is
                treated as one prompt rather than as a sequence of characters.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        else:
            # Materialise once: the prompts are both encoded and stored below.
            prompts = list(prompts)
        if not prompts:
            # Nothing to add; encoding an empty batch does not yield a 2-D array.
            return
        embeddings = np.ascontiguousarray(self.model.encode(prompts), dtype='float32')
        self.index.add(embeddings)
        self.prompts_track.extend(prompts)

    def most_similar(self, query, top_k=5):
        """Return the stored prompts closest to ``query``.

        Args:
            query: The text to search for.
            top_k: Maximum number of results to return.

        Returns:
            A ``(prompts, distances)`` pair: a list of the matching prompts,
            nearest first, and a NumPy array with the distance reported by the
            index for each of them. Both are empty when no prompts are stored
            or ``top_k`` is not positive.
        """
        # Never request more neighbours than there are stored prompts: FAISS
        # pads missing results with the label -1, and indexing the prompt list
        # with -1 would silently return the last prompt.
        k = min(top_k, len(self.prompts_track))
        if k <= 0:
            return [], np.empty(0, dtype='float32')

        # Encode the query as a single-row float32 batch.
        query_embedding = np.ascontiguousarray(self.model.encode([query]), dtype='float32')
        # A faster (non-exhaustive) search would require a different index type.
        distances, indices = self.index.search(query_embedding, k)

        # Defensive: drop any remaining "no result" slots (label -1).
        found = indices[0] >= 0
        similar_prompts = [self.prompts_track[idx] for idx in indices[0][found]]
        return similar_prompts, distances[0][found]
def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> dict:
    """Compute the cosine similarity between a query vector and corpus vectors.

    Args:
        query_vector: The 1-D query vector.
        corpus_vectors: The corpus vectors, one per row. A single 1-D vector
            is treated as a one-row corpus.

    Returns:
        A dict mapping each corpus row position to its cosine similarity with
        the query. Empty when the corpus is empty.

    Raises:
        ValueError: If the query vector or any corpus vector has zero norm,
            since the similarity is undefined in that case.
    """
    corpus = np.asarray(corpus_vectors)
    if corpus.size == 0:
        return {}
    corpus = np.atleast_2d(corpus)

    query = np.asarray(query_vector)
    # The query norm is the same for every row, so compute it once.
    query_norm = np.linalg.norm(query)
    if query_norm == 0:
        # Dividing by a zero norm would otherwise produce NaN for every row.
        raise ValueError("The query vector has zero norm.")

    corpus_norms = np.linalg.norm(corpus, axis=1)
    if np.any(corpus_norms == 0):
        raise ValueError("One of the corpus vectors has zero norm.")

    # One matrix-vector product gives every dot product at once.
    scores = (corpus @ query) / (corpus_norms * query_norm)
    return dict(enumerate(scores))