"""Load a Google Sheets spreadsheet into an AwaDB vector store and query it."""

import os
from typing import List, Tuple

import pandas as pd
from langchain.docstore.document import Document
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import AwaDB
from langchain.vectorstores.base import VectorStore

# URL fragments used to take a sheet URL apart and to rebuild an export URL.
sheet_url_x = "https://docs.google.com/spreadsheets/d/"
sheet_url_y = "/edit#gid="
sheet_url_y_exp = "/export?gid="

# Passed to HuggingFaceEmbeddings as ``cache_folder``.
cache_folder = ".embedding-model"
# Passed to AwaDB as ``log_and_data_dir``.
dir_vectordb = ".vectordb"


def faq_id(sheet_url: str) -> str:
    """Build a ``"<document>-<gid>"`` identifier from a sheet edit URL.

    Args:
        sheet_url: URL shaped like
            ``<sheet_url_x><document><sheet_url_y><gid>``.

    Returns:
        The text between the two markers, a hyphen, then the text that
        follows ``sheet_url_y``.

    Raises:
        ValueError: If ``sheet_url`` does not contain ``sheet_url_y``.
    """
    prefix_at = sheet_url.find(sheet_url_x)
    marker_at = sheet_url.find(sheet_url_y)
    if marker_at == -1:
        # Without the marker the slices below can only yield a meaningless id.
        raise ValueError(
            f"sheet URL does not contain {sheet_url_y!r}: {sheet_url!r}"
        )
    # NOTE(review): a missing prefix is deliberately not rejected. For plain
    # http:// URLs find() returns -1, and -1 + len(sheet_url_x) happens to
    # equal the length of the http:// form of the prefix, so those URLs
    # resolve correctly today. Any other prefix mismatch yields a wrong id.
    document = sheet_url[prefix_at + len(sheet_url_x) : marker_at]
    gid = sheet_url[marker_at + len(sheet_url_y) :]
    return document + "-" + gid


def xlsx_url(sheet_url: str) -> str:
    """Return ``sheet_url`` with ``sheet_url_y`` replaced by ``sheet_url_y_exp``.

    A URL that does not contain ``sheet_url_y`` is returned unchanged.
    """
    return sheet_url.replace(sheet_url_y, sheet_url_y_exp)


def xlsx_url_faq_id(faq_id: str) -> str:
    """Rebuild the export URL from an identifier produced by ``faq_id``.

    Args:
        faq_id: Identifier of the form ``"<document>-<gid>"``.

    Returns:
        ``sheet_url_x + <document> + sheet_url_y_exp + <gid>``.

    Raises:
        ValueError: If ``faq_id`` contains no ``"-"`` separator.
    """
    # The last hyphen is the separator, so hyphens inside the document part
    # are preserved.
    split_at = faq_id.rfind("-")
    if split_at == -1:
        raise ValueError(f"FAQ id has no '-' separator: {faq_id!r}")
    return sheet_url_x + faq_id[:split_at] + sheet_url_y_exp + faq_id[split_at + 1 :]


def read_df(xlsx_url: str) -> pd.DataFrame:
    """Read the workbook at ``xlsx_url`` with ``pandas.read_excel``.

    The first row is used as the header and ``keep_default_na=False`` is
    passed, so pandas' default NA strings are not applied.
    """
    return pd.read_excel(xlsx_url, header=0, keep_default_na=False)


def create_documents(df: pd.DataFrame, page_content_column: str) -> List[Document]:
    """Convert ``df`` into documents via ``DataFrameLoader``.

    Args:
        df: Source frame.
        page_content_column: Column handed to the loader as the page content.

    Returns:
        Whatever ``DataFrameLoader.load()`` yields for ``df``.
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    return loader.load()


def embedding_function(model_name: str) -> HuggingFaceEmbeddings:
    """Create a ``HuggingFaceEmbeddings`` for ``model_name``.

    Embeddings are requested normalized (``normalize_embeddings=True``) and
    ``cache_folder`` is passed as the model cache location.
    """
    return HuggingFaceEmbeddings(
        model_name=model_name,
        encode_kwargs={"normalize_embeddings": True},
        cache_folder=cache_folder,
    )


def vectordb(
    faq_id: str,
    documents: List[Document],
    embedding_function: Embeddings,
    init: bool = False,
) -> VectorStore:
    """Create or open the AwaDB table named ``faq_id``.

    Args:
        faq_id: Table name.
        documents: Documents to index; only used when ``init`` is true.
        embedding_function: Embeddings implementation given to AwaDB.
        init: When true, build the table from ``documents``; otherwise open
            a store on ``dir_vectordb`` and load the existing table.

    Returns:
        The AwaDB store.
    """
    if init:
        return AwaDB.from_documents(
            documents=documents,
            embedding=embedding_function,
            table_name=faq_id,
            log_and_data_dir=dir_vectordb,
        )

    store = AwaDB(embedding=embedding_function, log_and_data_dir=dir_vectordb)
    # NOTE(review): the result of load_local is ignored; presumably it reports
    # whether the table could be loaded - confirm and decide whether a failed
    # load should raise here.
    store.load_local(table_name=faq_id)
    return store


def similarity_search(
    vectordb: VectorStore, query: str, k: int
) -> List[Tuple[Document, float]]:
    """Return the ``k`` best matches for ``query`` with relevance scores.

    Side effect: sets the ``TOKENIZERS_PARALLELISM`` environment variable to
    ``"true"`` for the current process before searching.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    return vectordb.similarity_search_with_relevance_scores(query=query, k=k)