Spaces:

HIT-TMG
/

KaLM-Embedding

Running

App Files Community

YanshekWoo committed on Feb 6

Commit

c9d8253

verified ·

1 Parent(s): 2b145d6

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +21 -14

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ file_example = """Please upload a JSON file with a "text" field (with optional "
     {"title": "Title B", "text": "This an example text with the title"},
 ]
 ```
-Due to the computation resources, please test with small scale data.
 """
@@ -42,6 +42,12 @@ def upload_file_fn(
     try:
         with open(file_path) as f:
             document_data = json.load(f)
         documents = []
         for obj in document_data:
             text = obj["title"] + "\n" + obj["text"] if obj.get("title") else obj["text"]
@@ -55,18 +61,13 @@ def upload_file_fn(
         gr.Error(str(e))
         return None, gr.update(interactive=False)
-    if len(documents) < 3:
-        gr.Error("Please upload more than 3 documents.")
         return None, gr.update(interactive=False)
-    gr.Info(f"Upload {len(documents)} documents.")
-    if len(documents) > 1000:
-        gr.Info(f"Cut uploaded documents to 1000 due to the computation resource.")
-        documents = documents[: 1000]
     # documents_embeddings = model.encode(documents, show_progress_bar=True)
     documents_embeddings = []
-    batch_size = 8
     for i in tqdm(range(0, len(documents), batch_size)):
         batch_documents = documents[i: i+batch_size]
         batch_embeddings = model.encode(batch_documents, show_progress_bar=True)
@@ -87,7 +88,7 @@ def clear_file_fn():
 def retrieve_document_fn(question, document_states, instruct):
-    num_retrieval_doc = 3
     if document_states is None:
         gr.Warning("Please upload documents first!")
@@ -95,11 +96,16 @@ def retrieve_document_fn(question, document_states, instruct):
     document_data, document_index = document_states["document_data"], document_states["document_index"]
-    question_embedding = model.encode([str(instruct) + str(question)])
     batch_scores, batch_inxs = document_index.search(question_embedding, k=min(len(document_data), 150))
     answers = [document_data[i]["text"] for i in batch_inxs[0][:num_retrieval_doc]]
-    return answers[0], answers[1], answers[2], document_states
 def main(args):
@@ -126,9 +132,10 @@ def main(args):
         retrieval_interface = gr.Interface(
             fn=retrieve_document_fn,
             inputs=[gr.Textbox(label="Query"), document_state],
-            outputs=[gr.Text(label="Recall-1"), gr.Text(label="Recall-2"),  gr.Text(label="Recall-3"), gr.State()],
             additional_inputs=[gr.Textbox("Instruct: Given a query, retrieve documents that answer the query. \n Query: ", label="Instruct of Query", lines=2)],
             concurrency_limit=1,
         )
         # retrieval_interface.input_components[0] = gr.update(interactive=False)
@@ -153,7 +160,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name_or_path", type=str, default="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
-    parser.add_argument("--revision", type=str, default="refs/pr/2")
     args = parser.parse_args()
     main(args)

     {"title": "Title B", "text": "This an example text with the title"},
 ]
 ```
+Due to the computation resources, please test with small scale data (<1000).
 """
     try:
         with open(file_path) as f:
             document_data = json.load(f)
+        gr.Info(f"Upload {len(document_data)} documents.")
+        if len(document_data) > 1000:
+            gr.Info(f"Cut uploaded documents to 1000 due to the computation resource.")
+            document_data = document_data[: 1000]
         documents = []
         for obj in document_data:
             text = obj["title"] + "\n" + obj["text"] if obj.get("title") else obj["text"]
         gr.Error(str(e))
         return None, gr.update(interactive=False)
+    if len(documents) < 5:
+        gr.Error("Please upload more than 53 documents.")
         return None, gr.update(interactive=False)
     # documents_embeddings = model.encode(documents, show_progress_bar=True)
     documents_embeddings = []
+    batch_size = 16
     for i in tqdm(range(0, len(documents), batch_size)):
         batch_documents = documents[i: i+batch_size]
         batch_embeddings = model.encode(batch_documents, show_progress_bar=True)
 def retrieve_document_fn(question, document_states, instruct):
+    num_retrieval_doc = 5
     if document_states is None:
         gr.Warning("Please upload documents first!")
     document_data, document_index = document_states["document_data"], document_states["document_index"]
+    question_with_inst = str(instruct) + str(question)
+    if len(question_with_inst.strip()) == 0:
+        gr.Warning("Please enter a non-empty query.")
+        return None, None, None, None, None, document_states
+    question_embedding = model.encode([question_with_inst])
     batch_scores, batch_inxs = document_index.search(question_embedding, k=min(len(document_data), 150))
     answers = [document_data[i]["text"] for i in batch_inxs[0][:num_retrieval_doc]]
+    return answers[0], answers[1], answers[2], answers[3], answers[4],document_states
 def main(args):
         retrieval_interface = gr.Interface(
             fn=retrieve_document_fn,
             inputs=[gr.Textbox(label="Query"), document_state],
+            outputs=[gr.Text(label="Recall-1"), gr.Text(label="Recall-2"),  gr.Text(label="Recall-3"), gr.Text(label="Recall-4"), gr.Text(label="Recall-5"), gr.State()],
             additional_inputs=[gr.Textbox("Instruct: Given a query, retrieve documents that answer the query. \n Query: ", label="Instruct of Query", lines=2)],
             concurrency_limit=1,
+            allow_flagging="never",
         )
         # retrieval_interface.input_components[0] = gr.update(interactive=False)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name_or_path", type=str, default="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
+    parser.add_argument("--revision", type=str, default=None)
     args = parser.parse_args()
     main(args)