andreasmartin committed on
Commit
6c9d07b
·
1 Parent(s): 78aafcc

deepnote update

Browse files
Files changed (3) hide show
  1. app.py +14 -2
  2. faq.py +1 -1
  3. util.py +22 -2
app.py CHANGED
@@ -4,6 +4,7 @@ import faq as faq
4
  import util as util
5
  import uvicorn
6
  import gradio as gr
 
7
 
8
  app = FastAPI()
9
 
@@ -15,6 +16,15 @@ class AskRequest(BaseModel):
15
  k: int
16
 
17
 
 
 
 
 
 
 
 
 
 
18
  @app.post("/api/v1/ask")
19
  async def ask_api(request: AskRequest):
20
  return ask(
@@ -23,12 +33,14 @@ async def ask_api(request: AskRequest):
23
 
24
 
25
  @app.post("/api/v2/ask")
26
- async def ask_api(request: AskRequest):
27
  util.SPLIT_PAGE_BREAKS = True
 
 
28
  vectordb = faq.load_vectordb(request.sheet_url, request.page_content_column)
29
  documents = faq.similarity_search(vectordb, request.question, k=request.k)
30
  df_doc = util.transform_documents_to_dataframe(documents)
31
- df_filter = util.remove_duplicates_by_column(df_doc, "ID")
32
  return util.dataframe_to_dict(df_filter)
33
 
34
 
 
4
  import util as util
5
  import uvicorn
6
  import gradio as gr
7
+ from typing import List, Optional
8
 
9
  app = FastAPI()
10
 
 
16
  k: int
17
 
18
 
19
class AskRequestEx(BaseModel):
    """Request body for the ``/api/v2/ask`` endpoint.

    Carries the question and sheet location plus two extra fields used by
    the v2 handler: the column used for de-duplicating results and an
    optional list of synonym groups.
    """

    # Question text handed to the similarity search.
    question: str
    # Sheet URL and content column handed to ``faq.load_vectordb``.
    sheet_url: str
    page_content_column: str
    # Passed as ``k`` to the similarity search.
    k: int
    # Column name passed to ``util.remove_duplicates_by_column``.
    id_column: str
    # Each inner list is one group of synonyms; omitted/None means no groups.
    synonyms: Optional[List[List[str]]] = None
26
+
27
+
28
  @app.post("/api/v1/ask")
29
  async def ask_api(request: AskRequest):
30
  return ask(
 
33
 
34
 
35
  @app.post("/api/v2/ask")
36
async def ask_api(request: AskRequestEx):
    """Answer a question from a sheet-backed FAQ, de-duplicated by ID column.

    Enables page-break splitting and installs the request's synonym groups
    on the ``util`` module, loads the vector DB for the sheet, runs a
    similarity search for ``request.question`` with ``k=request.k``, and
    drops duplicate hits by ``request.id_column``.

    Args:
        request: Parsed request body (see ``AskRequestEx``).

    Returns:
        Whatever ``util.dataframe_to_dict`` produces for the filtered hits.
    """
    # NOTE(review): these are module-level settings on ``util``, so they are
    # shared by every request handled by this process; the flag below stays
    # True for later callers as well.
    util.SPLIT_PAGE_BREAKS = True
    # Assign unconditionally. Only assigning when synonyms are present left
    # the previous request's synonyms in place for requests that send none.
    util.SYNONYMS = request.synonyms
    vectordb = faq.load_vectordb(request.sheet_url, request.page_content_column)
    documents = faq.similarity_search(vectordb, request.question, k=request.k)
    df_doc = util.transform_documents_to_dataframe(documents)
    df_filter = util.remove_duplicates_by_column(df_doc, request.id_column)
    return util.dataframe_to_dict(df_filter)
45
 
46
 
faq.py CHANGED
@@ -103,7 +103,7 @@ def create_vectordb_id(
103
  if embedding_function is None:
104
  embedding_function = define_embedding_function(EMBEDDING_MODEL)
105
 
106
- df = util.read_df(util.xlsx_url(faq_id))
107
  documents = create_documents(df, page_content_column)
108
  vectordb = get_vectordb(
109
  faq_id=faq_id, embedding_function=embedding_function, documents=documents
 
103
  if embedding_function is None:
104
  embedding_function = define_embedding_function(EMBEDDING_MODEL)
105
 
106
+ df = util.read_df(util.xlsx_url(faq_id), page_content_column)
107
  documents = create_documents(df, page_content_column)
108
  vectordb = get_vectordb(
109
  faq_id=faq_id, embedding_function=embedding_function, documents=documents
util.py CHANGED
@@ -4,6 +4,7 @@ SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
4
  SHEET_URL_Y = "/edit#gid="
5
  SHEET_URL_Y_EXPORT = "/export?gid="
6
  SPLIT_PAGE_BREAKS = False
 
7
 
8
 
9
  def get_id(sheet_url: str) -> str:
@@ -17,10 +18,12 @@ def xlsx_url(get_id: str) -> str:
17
  return SHEET_URL_X + get_id[0:y] + SHEET_URL_Y_EXPORT + get_id[y + 1 :]
18
 
19
 
20
- def read_df(xlsx_url: str, split_page_breaks: bool = SPLIT_PAGE_BREAKS) -> pd.DataFrame:
21
  df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
22
- if split_page_breaks:
23
  df = split_page_breaks(df, page_content_column)
 
 
24
  return df
25
 
26
 
@@ -71,3 +74,20 @@ def dataframe_to_dict(df):
71
  df_records = df.to_dict(orient="records")
72
 
73
  return df_records
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  SHEET_URL_Y = "/edit#gid="
5
  SHEET_URL_Y_EXPORT = "/export?gid="
6
  SPLIT_PAGE_BREAKS = False
7
+ SYNONYMS = None
8
 
9
 
10
  def get_id(sheet_url: str) -> str:
 
18
  return SHEET_URL_X + get_id[0:y] + SHEET_URL_Y_EXPORT + get_id[y + 1 :]
19
 
20
 
21
def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
    """Load the sheet at ``xlsx_url`` and apply the module-level expansions.

    The workbook is read with its first row as the header and with pandas'
    default NaN markers disabled. Depending on the module settings, the
    frame is then post-processed on ``page_content_column``:

    * ``SPLIT_PAGE_BREAKS`` truthy -> ``split_page_breaks`` is applied.
    * ``SYNONYMS`` not None -> ``duplicate_rows_with_synonyms`` is applied.

    Args:
        xlsx_url: Location passed straight to ``pd.read_excel``.
        page_content_column: Column the optional expansions operate on.

    Returns:
        The loaded (and possibly expanded) DataFrame.
    """
    frame = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
    if SPLIT_PAGE_BREAKS:
        frame = split_page_breaks(frame, page_content_column)
    if SYNONYMS is None:
        return frame
    return duplicate_rows_with_synonyms(frame, page_content_column, SYNONYMS)
28
 
29
 
 
74
  df_records = df.to_dict(orient="records")
75
 
76
  return df_records
77
+
78
+
79
def duplicate_rows_with_synonyms(df, column, synonyms):
    """Expand ``df`` with extra rows in which words are swapped for synonyms.

    Every original row is kept. For each whitespace-separated word of
    ``row[column]`` that appears in one of the synonym groups, one extra row
    is appended per other member of that group, with all whole-word
    occurrences of the word replaced by that member.

    Args:
        df: Source DataFrame.
        column: Name of the text column to scan and rewrite.
        synonyms: Iterable of synonym groups (each a list of strings).

    Returns:
        A new DataFrame with the same columns and a fresh 0..n-1 index,
        containing the original rows interleaved with their variants.
    """
    # Function-scope import keeps this block self-contained.
    import re

    new_rows = []
    for _, row in df.iterrows():
        new_rows.append(row)
        text = row[column]
        # Non-text cells (e.g. numbers read from the sheet) have nothing to
        # split or replace; keep the row as-is instead of raising.
        if not isinstance(text, str):
            continue

        # Track emitted texts so a repeated word, or overlapping groups,
        # cannot add the same variant twice.
        seen = {text}
        # Unique words in first-occurrence order.
        words = list(dict.fromkeys(text.split()))
        for synonym_list in synonyms:
            for word in words:
                if word not in synonym_list:
                    continue
                # Match the word only as a whole whitespace-delimited token,
                # so "cat" does not rewrite the start of "category".
                pattern = re.compile(r"(?<!\S)" + re.escape(word) + r"(?!\S)")
                for synonym in synonym_list:
                    if synonym == word:
                        continue
                    # Callable replacement: the synonym is inserted
                    # literally, without backslash/group interpretation.
                    variant = pattern.sub(lambda _match, s=synonym: s, text)
                    if variant in seen:
                        continue
                    seen.add(variant)
                    new_row = row.copy()
                    new_row[column] = variant
                    new_rows.append(new_row)

    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df.reset_index(drop=True)