andreasmartin committed
Commit 718e159 · 1 Parent(s): 6c9d07b

deepnote update

Files changed (3)
  1. app.py +14 -16
  2. faq.py +20 -14
  3. util.py +17 -18
app.py CHANGED
@@ -5,6 +5,7 @@ import util as util
 import uvicorn
 import gradio as gr
 from typing import List, Optional
+from fastapi.responses import JSONResponse

 app = FastAPI()

@@ -13,15 +14,9 @@ class AskRequest(BaseModel):
     question: str
     sheet_url: str
     page_content_column: str
-    k: int
-
-
-class AskRequestEx(BaseModel):
-    question: str
-    sheet_url: str
-    page_content_column: str
-    k: int
-    id_column: str
+    k: int = 20
+    reload_collection: Optional[bool] = None
+    id_column: Optional[str] = None
     synonyms: Optional[List[List[str]]] = None


@@ -33,15 +28,17 @@ async def ask_api(request: AskRequest):


 @app.post("/api/v2/ask")
-async def ask_api(request: AskRequestEx):
-    util.SPLIT_PAGE_BREAKS = True
+async def ask_api(request: AskRequest):
+    if request.id_column is not None:
+        util.SPLIT_PAGE_BREAKS = True
     if request.synonyms is not None:
         util.SYNONYMS = request.synonyms
     vectordb = faq.load_vectordb(request.sheet_url, request.page_content_column)
     documents = faq.similarity_search(vectordb, request.question, k=request.k)
     df_doc = util.transform_documents_to_dataframe(documents)
-    df_filter = util.remove_duplicates_by_column(df_doc, request.id_column)
-    return util.dataframe_to_dict(df_filter)
+    if request.id_column is not None:
+        df_doc = util.remove_duplicates_by_column(df_doc, request.id_column)
+    return JSONResponse(util.dataframe_to_dict(df_doc))


 @app.delete("/api/v1/")
@@ -52,8 +49,9 @@ async def delete_vectordb_api():
 def ask(sheet_url: str, page_content_column: str, k: int, question: str):
     util.SPLIT_PAGE_BREAKS = False
     vectordb = faq.load_vectordb(sheet_url, page_content_column)
-    result = faq.similarity_search(vectordb, question, k=k)
-    return result
+    documents = faq.similarity_search(vectordb, question, k=k)
+    df_doc = util.transform_documents_to_dataframe(documents)
+    return util.dataframe_to_dict(df_doc)


 def delete_vectordb():
@@ -63,7 +61,7 @@ def delete_vectordb():
 with gr.Blocks() as block:
     sheet_url = gr.Textbox(label="Google Sheet URL")
     page_content_column = gr.Textbox(label="Question Column")
-    k = gr.Slider(2, 5, step=1, label="K")
+    k = gr.Slider(1, 30, step=1, label="K")
     question = gr.Textbox(label="Question")
     ask_button = gr.Button("Ask")
     answer_output = gr.JSON(label="Answer")
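For reference, here is a minimal sketch of how the reworked /api/v2/ask endpoint could be called once the app is served with uvicorn. The host/port, sheet URL, and column names are placeholders, not part of the commit; the payload fields follow the AskRequest model above, so k, id_column, and synonyms can all be omitted.

import requests

# Hypothetical local deployment; adjust to your uvicorn host/port.
API_URL = "http://localhost:8000/api/v2/ask"

payload = {
    "question": "How do I reset my password?",
    "sheet_url": "https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0",  # placeholder
    "page_content_column": "Question",  # assumed column name
    # "k" now defaults to 20 server-side; passing "id_column" additionally
    # switches on page-break splitting and de-duplication of the hits.
    "id_column": "ID",  # assumed column name
    "synonyms": [["password", "passphrase"]],
}

response = requests.post(API_URL, json=payload)
response.raise_for_status()
for record in response.json():  # one dict per hit, including "document_score"
    print(record)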
faq.py CHANGED
@@ -32,7 +32,7 @@ def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:


 def get_vectordb(
-    faq_id: str,
+    collection_id: str,
     embedding_function: Embeddings,
     documents: List[Document] = None,
     vectordb_type: str = VECTORDB_TYPE,
@@ -44,31 +44,32 @@ def get_vectordb(
             vectordb = AwaDB(
                 embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER
             )
-            if not vectordb.load_local(table_name=faq_id):
-                raise Exception("faq_id may not exists")
+            if not vectordb.load_local(table_name=collection_id):
+                raise Exception("collection_id may not exist")
         else:
             vectordb = AwaDB.from_documents(
                 documents=documents,
                 embedding=embedding_function,
-                table_name=faq_id,
+                table_name=collection_id,
                 log_and_data_dir=VECTORDB_FOLDER,
             )
     if vectordb_type is VECTORDB_TYPES.Chroma:
         if documents is None:
             vectordb = Chroma(
-                collection_name=faq_id,
+                collection_name=collection_id,
                 embedding_function=embedding_function,
                 persist_directory=VECTORDB_FOLDER,
            )
            if not vectordb.get()["ids"]:
-                raise Exception("faq_id may not exists")
+                raise Exception("collection_id may not exist")
         else:
             vectordb = Chroma.from_documents(
                 documents=documents,
                 embedding=embedding_function,
-                collection_name=faq_id,
+                collection_name=collection_id,
                 persist_directory=VECTORDB_FOLDER,
             )
+            vectordb.persist()
     return vectordb


@@ -80,33 +81,33 @@ def similarity_search(


 def load_vectordb_id(
-    faq_id: str,
+    collection_id: str,
     page_content_column: str,
     embedding_function_name: str = EMBEDDING_MODEL,
 ) -> VectorStore:
     embedding_function = define_embedding_function(embedding_function_name)
     vectordb = None
     try:
-        vectordb = get_vectordb(faq_id=faq_id, embedding_function=embedding_function)
+        vectordb = get_vectordb(collection_id=collection_id, embedding_function=embedding_function)
     except Exception as e:
         print(e)
-        vectordb = create_vectordb_id(faq_id, page_content_column, embedding_function)
+        vectordb = create_vectordb_id(collection_id, page_content_column, embedding_function)

     return vectordb


 def create_vectordb_id(
-    faq_id: str,
+    collection_id: str,
     page_content_column: str,
     embedding_function: HuggingFaceEmbeddings = None,
 ) -> VectorStore:
     if embedding_function is None:
         embedding_function = define_embedding_function(EMBEDDING_MODEL)

-    df = util.read_df(util.xlsx_url(faq_id), page_content_column)
+    df = util.read_df(util.xlsx_url(collection_id), page_content_column)
     documents = create_documents(df, page_content_column)
     vectordb = get_vectordb(
-        faq_id=faq_id, embedding_function=embedding_function, documents=documents
+        collection_id=collection_id, embedding_function=embedding_function, documents=documents
    )
     return vectordb

@@ -115,5 +116,10 @@ def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
     return load_vectordb_id(util.get_id(sheet_url), page_content_column)


-def delete_vectordb():
+def delete_vectordb() -> None:
     shutil.rmtree(VECTORDB_FOLDER, ignore_errors=True)
+
+
+def delete_vectordb_current_collection(vectordb: VectorStore) -> None:
+    vectordb.delete_collection()
+    vectordb.persist()
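End to end, the renamed helpers would typically be driven as in the sketch below. The sheet URL and column name are placeholders, and since delete_vectordb_current_collection calls the Chroma-specific delete_collection()/persist() methods, the sketch assumes VECTORDB_TYPE is Chroma.

import faq
import util

# Resolve the sheet to a collection id and load the vector store; on a
# cache miss, load_vectordb_id's try/except rebuilds it from the sheet.
sheet_url = "https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0"  # placeholder
vectordb = faq.load_vectordb(sheet_url, "Question")  # assumed column name

# Query, then flatten the scored hits into records.
documents = faq.similarity_search(vectordb, "How do I reset my password?", k=5)
df = util.transform_documents_to_dataframe(documents)
print(util.dataframe_to_dict(df))

# Drop only this sheet's collection instead of wiping VECTORDB_FOLDER.
faq.delete_vectordb_current_collection(vectordb)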
util.py CHANGED
@@ -1,4 +1,5 @@
 import pandas as pd
+from langchain.docstore.document import Document

 SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
 SHEET_URL_Y = "/edit#gid="
@@ -27,7 +28,7 @@ def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
     return df


-def split_page_breaks(df, column_name):
+def split_page_breaks(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
     split_values = df[column_name].str.split("\n")

     new_df = pd.DataFrame({column_name: split_values.explode()})
@@ -46,37 +47,35 @@ def split_page_breaks(df, column_name):
     return new_df


-def transform_documents_to_dataframe(documents):
-    metadata_keys = set()
-    for doc, _ in documents:
-        metadata_keys.update(doc.metadata.keys())
-
-    metadata_values = {key: [] for key in metadata_keys}
-    for doc, _ in documents:
-        for key, value in doc.metadata.items():
-            metadata_values[key].append(value)
-
-    metadata_values["Score"] = [score for _, score in documents]
-
-    df = pd.DataFrame(metadata_values)
-
-    return df
-
-
-def remove_duplicates_by_column(df, column):
-    df.drop_duplicates(subset=column, inplace=True)
-    df.reset_index(drop=True, inplace=True)
+def transform_documents_to_dataframe(documents: list[tuple[Document, float]]) -> pd.DataFrame:
+    keys = []
+    values = {"document_score": [], "page_content": []}
+
+    for doc, score in documents:
+        for key, value in doc.metadata.items():
+            if key not in keys:
+                keys.append(key)
+                values[key] = []
+            values[key].append(value)
+        values["document_score"].append(score)
+        values["page_content"].append(doc.page_content)
+
+    return pd.DataFrame(values)
+
+
+def remove_duplicates_by_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    df.drop_duplicates(subset=column_name, inplace=True, ignore_index=True)

     return df


-def dataframe_to_dict(df):
+def dataframe_to_dict(df: pd.DataFrame) -> list[dict]:
     df_records = df.to_dict(orient="records")

     return df_records


-def duplicate_rows_with_synonyms(df, column, synonyms):
+def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[list[str]]) -> pd.DataFrame:
     new_rows = []
     for index, row in df.iterrows():
         new_rows.append(row)
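As a quick sanity check, the rewritten helpers can be exercised with stub (Document, score) tuples; the "ID" and "Answer" metadata keys below are invented for illustration.

from langchain.docstore.document import Document

import util

# Two hits that trace back to the same source row (same "ID"), as happens
# after split_page_breaks has fanned one row out into several questions.
documents = [
    (Document(page_content="How do I log in?", metadata={"ID": 1, "Answer": "Use SSO."}), 0.12),
    (Document(page_content="How do I sign in?", metadata={"ID": 1, "Answer": "Use SSO."}), 0.19),
]

df = util.transform_documents_to_dataframe(documents)
# One row per hit; columns: document_score, page_content, ID, Answer.

df = util.remove_duplicates_by_column(df, "ID")  # drop_duplicates keeps the first row per ID
assert len(df) == 1

print(util.dataframe_to_dict(df))
# [{'document_score': 0.12, 'page_content': 'How do I log in?', 'ID': 1, 'Answer': 'Use SSO.'}]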