andreasmartin committed on
Commit
4dc1d14
·
unverified ·
1 Parent(s): 36d62ce
Files changed (3) hide show
  1. app.py +1 -1
  2. faq.py +3 -20
  3. util.py +23 -3
app.py CHANGED
@@ -34,7 +34,7 @@ async def ask_api(request: AskRequest):
34
  documents = faq.similarity_search(vectordb, request.question, k=request.k)
35
  df_doc = util.transform_documents_to_dataframe(documents)
36
  df_filter = util.remove_duplicates_by_column(df_doc, "ID")
37
- return util.serialize_dataframe_as_json(df_filter)
38
 
39
 
40
  @app.delete("/api/v1/")
 
34
  documents = faq.similarity_search(vectordb, request.question, k=request.k)
35
  df_doc = util.transform_documents_to_dataframe(documents)
36
  df_filter = util.remove_duplicates_by_column(df_doc, "ID")
37
+ return util.dataframe_to_dict(df_filter)
38
 
39
 
40
  @app.delete("/api/v1/")
faq.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  from langchain.document_loaders import DataFrameLoader
3
  from langchain.embeddings import HuggingFaceEmbeddings
@@ -10,30 +11,12 @@ import os
10
  import shutil
11
  from enum import Enum
12
 
13
- SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
14
- SHEET_URL_Y = "/edit#gid="
15
- SHEET_URL_Y_EXPORT = "/export?gid="
16
  EMBEDDING_MODEL_FOLDER = ".embedding-model"
17
  VECTORDB_FOLDER = ".vectordb"
18
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
19
  VECTORDB_TYPE = Enum("VECTORDB_TYPE", ["AwaDB", "Chroma"])
20
 
21
 
22
- def faq_id(sheet_url: str) -> str:
23
- x = sheet_url.find(SHEET_URL_X)
24
- y = sheet_url.find(SHEET_URL_Y)
25
- return sheet_url[x + len(SHEET_URL_X) : y] + "-" + sheet_url[y + len(SHEET_URL_Y) :]
26
-
27
-
28
- def xlsx_url(faq_id: str) -> str:
29
- y = faq_id.rfind("-")
30
- return SHEET_URL_X + faq_id[0:y] + SHEET_URL_Y_EXPORT + faq_id[y + 1 :]
31
-
32
-
33
- def read_df(xlsx_url: str) -> pd.DataFrame:
34
- return pd.read_excel(xlsx_url, header=0, keep_default_na=False)
35
-
36
-
37
  def create_documents(df: pd.DataFrame, page_content_column: str) -> pd.DataFrame:
38
  loader = DataFrameLoader(df, page_content_column=page_content_column)
39
  return loader.load()
@@ -109,7 +92,7 @@ def create_vectordb_id(
109
  if embedding_function is None:
110
  embedding_function = define_embedding_function(EMBEDDING_MODEL)
111
 
112
- df = read_df(xlsx_url(faq_id))
113
  documents = create_documents(df, page_content_column)
114
  vectordb = get_vectordb(
115
  faq_id=faq_id, embedding_function=embedding_function, documents=documents
@@ -118,7 +101,7 @@ def create_vectordb_id(
118
 
119
 
120
  def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
121
- return load_vectordb_id(faq_id(sheet_url), page_content_column)
122
 
123
 
124
  def delete_vectordb():
 
1
+ import util as util
2
  import pandas as pd
3
  from langchain.document_loaders import DataFrameLoader
4
  from langchain.embeddings import HuggingFaceEmbeddings
 
11
  import shutil
12
  from enum import Enum
13
 
 
 
 
14
  EMBEDDING_MODEL_FOLDER = ".embedding-model"
15
  VECTORDB_FOLDER = ".vectordb"
16
  EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
17
  VECTORDB_TYPE = Enum("VECTORDB_TYPE", ["AwaDB", "Chroma"])
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def create_documents(df: pd.DataFrame, page_content_column: str) -> pd.DataFrame:
21
  loader = DataFrameLoader(df, page_content_column=page_content_column)
22
  return loader.load()
 
92
  if embedding_function is None:
93
  embedding_function = define_embedding_function(EMBEDDING_MODEL)
94
 
95
+ df = util.read_df(util.xlsx_url(faq_id))
96
  documents = create_documents(df, page_content_column)
97
  vectordb = get_vectordb(
98
  faq_id=faq_id, embedding_function=embedding_function, documents=documents
 
101
 
102
 
103
  def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
104
+ return load_vectordb_id(util.get_id(sheet_url), page_content_column)
105
 
106
 
107
  def delete_vectordb():
util.py CHANGED
@@ -1,5 +1,25 @@
1
  import pandas as pd
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def split_page_breaks(df, column_name):
4
  split_values = df[column_name].str.split("\n")
5
 
@@ -43,7 +63,7 @@ def remove_duplicates_by_column(df, column):
43
  return df
44
 
45
 
46
- def serialize_dataframe_as_json(df):
47
- json_array = df.to_dict(orient='records')
48
 
49
- return json_array
 
1
  import pandas as pd
2
 
3
+ SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
4
+ SHEET_URL_Y = "/edit#gid="
5
+ SHEET_URL_Y_EXPORT = "/export?gid="
6
+
7
+
8
+ def get_id(sheet_url: str) -> str:
9
+ x = sheet_url.find(SHEET_URL_X)
10
+ y = sheet_url.find(SHEET_URL_Y)
11
+ return sheet_url[x + len(SHEET_URL_X) : y] + "-" + sheet_url[y + len(SHEET_URL_Y) :]
12
+
13
+
14
+ def xlsx_url(get_id: str) -> str:
15
+ y = get_id.rfind("-")
16
+ return SHEET_URL_X + get_id[0:y] + SHEET_URL_Y_EXPORT + get_id[y + 1 :]
17
+
18
+
19
+ def read_df(xlsx_url: str) -> pd.DataFrame:
20
+ return pd.read_excel(xlsx_url, header=0, keep_default_na=False)
21
+
22
+
23
  def split_page_breaks(df, column_name):
24
  split_values = df[column_name].str.split("\n")
25
 
 
63
  return df
64
 
65
 
66
+ def dataframe_to_dict(df):
67
+ df_records = df.to_dict(orient='records')
68
 
69
+ return df_records