Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- app.py +128 -0
- requirements.txt +3 -0
- resources/head.html +36 -0
- resources/styles.css +158 -0
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import argparse
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import faiss
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from sentence_transformers import SentenceTransformer
|
11 |
+
|
12 |
+
|
13 |
+
file_example = """Please upload a JSON file with a "text" field (with optional "title" field). For example
|
14 |
+
```JSON
|
15 |
+
[
|
16 |
+
{"title": "", "text": "This an example text without the title"},
|
17 |
+
{"title": "Title A", "text": "This an example text with the title"},
|
18 |
+
{"title": "Title B", "text": "This an example text with the title"},
|
19 |
+
]
|
20 |
+
```"""
|
21 |
+
|
22 |
+
|
23 |
+
def create_index(embeddings, use_gpu):
|
24 |
+
index = faiss.IndexFlatIP(len(embeddings[0]))
|
25 |
+
embeddings = np.asarray(embeddings, dtype=np.float32)
|
26 |
+
if use_gpu:
|
27 |
+
co = faiss.GpuMultipleClonerOptions()
|
28 |
+
co.shard = True
|
29 |
+
co.useFloat16 = True
|
30 |
+
index = faiss.index_cpu_to_all_gpus(index, co=co)
|
31 |
+
index.add(embeddings)
|
32 |
+
return index
|
33 |
+
|
34 |
+
|
35 |
+
def upload_file_fn(
|
36 |
+
file_path: List[str],
|
37 |
+
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
38 |
+
):
|
39 |
+
try:
|
40 |
+
with open(file_path) as f:
|
41 |
+
document_data = json.load(f)
|
42 |
+
documents = []
|
43 |
+
for obj in document_data:
|
44 |
+
text = obj["title"] + "\n" + obj["text"] if obj.get("title") else obj["text"]
|
45 |
+
documents.append(text)
|
46 |
+
except Exception as e:
|
47 |
+
print(e)
|
48 |
+
gr.Warning("Read the file failed. Please check the data format.")
|
49 |
+
return None, None
|
50 |
+
|
51 |
+
documents_embeddings = model.encode(documents)
|
52 |
+
|
53 |
+
document_index = create_index(documents_embeddings, use_gpu=False)
|
54 |
+
|
55 |
+
if torch.cuda.is_available():
|
56 |
+
torch.cuda.empty_cache()
|
57 |
+
torch.cuda.ipc_collect()
|
58 |
+
|
59 |
+
return document_index, document_data
|
60 |
+
|
61 |
+
|
62 |
+
def clear_file_fn():
|
63 |
+
return None, None
|
64 |
+
|
65 |
+
|
66 |
+
def retrieve_document_fn(question, document_data, document_index):
|
67 |
+
num_retrieval_doc = 3
|
68 |
+
if document_index is None or document_data is None:
|
69 |
+
gr.Warning("Please upload documents first!")
|
70 |
+
return [None for i in range(num_retrieval_doc)]
|
71 |
+
|
72 |
+
question_embedding = model.encode([question])
|
73 |
+
batch_scores, batch_inxs = document_index.search(question_embedding, k=num_retrieval_doc)
|
74 |
+
|
75 |
+
answers = [document_data[i]["text"] for i in batch_inxs[0][:num_retrieval_doc]]
|
76 |
+
return tuple(answers)
|
77 |
+
|
78 |
+
|
79 |
+
def main(args):
|
80 |
+
global model
|
81 |
+
|
82 |
+
model = SentenceTransformer(args.model_name_or_path)
|
83 |
+
|
84 |
+
document_index = gr.State()
|
85 |
+
document_data = gr.State()
|
86 |
+
|
87 |
+
|
88 |
+
with open(Path(__file__).parent / "resources/head.html") as html_file:
|
89 |
+
head = html_file.read().strip()
|
90 |
+
with gr.Blocks(theme=gr.themes.Soft(font="sans-serif").set(background_fill_primary="linear-gradient(90deg, #e3ffe7 0%, #d9e7ff 100%)", background_fill_primary_dark="linear-gradient(90deg, #4b6cb7 0%, #182848 100%)",),
|
91 |
+
head=head,
|
92 |
+
css=Path(__file__).parent / "resources/styles.css",
|
93 |
+
title="KaLM-Embedding",
|
94 |
+
fill_height=True,
|
95 |
+
analytics_enabled=False) as demo:
|
96 |
+
gr.Markdown(file_example)
|
97 |
+
doc_files_box = gr.File(label="Upload Documents", file_types=[".json"], file_count="single")
|
98 |
+
retrieval_interface = gr.Interface(
|
99 |
+
fn=retrieve_document_fn,
|
100 |
+
inputs=["text"],
|
101 |
+
outputs=["text", "text", "text"],
|
102 |
+
additional_inputs=[document_data, document_index],
|
103 |
+
concurrency_limit=1,
|
104 |
+
)
|
105 |
+
|
106 |
+
doc_files_box.upload(
|
107 |
+
upload_file_fn,
|
108 |
+
[doc_files_box],
|
109 |
+
[document_index, document_data],
|
110 |
+
queue=True,
|
111 |
+
trigger_mode="once"
|
112 |
+
)
|
113 |
+
doc_files_box.clear(
|
114 |
+
upload_file_fn,
|
115 |
+
None,
|
116 |
+
[document_index, document_data],
|
117 |
+
queue=True,
|
118 |
+
trigger_mode="once"
|
119 |
+
)
|
120 |
+
demo.launch()
|
121 |
+
|
122 |
+
|
123 |
+
if __name__ == "__main__":
|
124 |
+
parser = argparse.ArgumentParser()
|
125 |
+
parser.add_argument("--model_name_or_path", type=str, default="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
|
126 |
+
|
127 |
+
args = parser.parse_args()
|
128 |
+
main(args)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.39.1
|
2 |
+
sentence-transformers==2.5.1
|
3 |
+
faiss-cpu==1.8.0
|
resources/head.html
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<style>
|
2 |
+
:root {
|
3 |
+
--highlight-background-color-light: #F1EAFF;
|
4 |
+
--highlight-background-color-dark: #3E00FF;
|
5 |
+
}
|
6 |
+
|
7 |
+
body {
|
8 |
+
--highlight-background-color: var(--highlight-background-color-light);
|
9 |
+
}
|
10 |
+
|
11 |
+
@media (prefers-color-scheme: dark) {
|
12 |
+
body {
|
13 |
+
--highlight-background-color: var(--highlight-background-color-dark);
|
14 |
+
}
|
15 |
+
}
|
16 |
+
</style>
|
17 |
+
|
18 |
+
<script>
|
19 |
+
document.addEventListener('click', function(event) {
|
20 |
+
if (event.target.tagName.toLowerCase() === 'a') {
|
21 |
+
var href = event.target.getAttribute('href');
|
22 |
+
|
23 |
+
if (href && href.startsWith('#')) {
|
24 |
+
var targetId = href.substring(1);
|
25 |
+
var targetArticle = document.getElementById(targetId);
|
26 |
+
var articles = document.getElementsByTagName('article');
|
27 |
+
|
28 |
+
for (var i = 0; i < articles.length; i++) {
|
29 |
+
articles[i].style.backgroundColor = '';
|
30 |
+
}
|
31 |
+
|
32 |
+
targetArticle.style.backgroundColor = 'var(--highlight-background-color)';
|
33 |
+
}
|
34 |
+
}
|
35 |
+
});
|
36 |
+
</script>
|
resources/styles.css
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
a {
|
3 |
+
text-decoration: none!important;
|
4 |
+
}
|
5 |
+
|
6 |
+
article {
|
7 |
+
padding: 12px;
|
8 |
+
}
|
9 |
+
|
10 |
+
|
11 |
+
progress::-webkit-progress-bar {
|
12 |
+
border-radius: 8rpx !important;
|
13 |
+
background-color: #f0f0f0;
|
14 |
+
}
|
15 |
+
progress::-webkit-progress-value {
|
16 |
+
border-radius: 8rpx !important;
|
17 |
+
}
|
18 |
+
.progress-1 {
|
19 |
+
color: #FF004D;
|
20 |
+
}
|
21 |
+
.progress-1::-webkit-progress-value {
|
22 |
+
background-color:#FF004D;
|
23 |
+
}
|
24 |
+
.progress-1::-moz-progress-bar {
|
25 |
+
background-color:#FF004D;
|
26 |
+
}
|
27 |
+
.progress-2 {
|
28 |
+
color: #FF8400;
|
29 |
+
}
|
30 |
+
.progress-2::-webkit-progress-value {
|
31 |
+
background-color:#FF8400;
|
32 |
+
}
|
33 |
+
.progress-2::-moz-progress-bar {
|
34 |
+
background-color:#FF8400;
|
35 |
+
}
|
36 |
+
.progress-3 {
|
37 |
+
color: #0079FF;
|
38 |
+
}
|
39 |
+
.progress-3::-webkit-progress-value {
|
40 |
+
background-color:#0079FF;
|
41 |
+
}
|
42 |
+
.progress-3::-moz-progress-bar {
|
43 |
+
background-color:#0079FF;
|
44 |
+
}
|
45 |
+
|
46 |
+
.factual-score {
|
47 |
+
width: 20px;
|
48 |
+
height: 4px;
|
49 |
+
}
|
50 |
+
|
51 |
+
|
52 |
+
.hide {
|
53 |
+
display: none;
|
54 |
+
}
|
55 |
+
|
56 |
+
.html-text {
|
57 |
+
white-space: pre-line;
|
58 |
+
word-break: normal;
|
59 |
+
text-align: justify;
|
60 |
+
overflow-y: auto;
|
61 |
+
overflow-x: hidden;
|
62 |
+
height: 450px;
|
63 |
+
padding: 2px;
|
64 |
+
}
|
65 |
+
|
66 |
+
.tab {
|
67 |
+
border: none;
|
68 |
+
outline: none;
|
69 |
+
padding: 0;
|
70 |
+
margin: 0;
|
71 |
+
}
|
72 |
+
|
73 |
+
|
74 |
+
/* html text slider */
|
75 |
+
::-webkit-scrollbar {
|
76 |
+
width: 6px;
|
77 |
+
height: 6px;
|
78 |
+
}
|
79 |
+
::-webkit-scrollbar-track {
|
80 |
+
border-radius: 3px;
|
81 |
+
background: rgba(0,0,0,0.06);
|
82 |
+
-webkit-box-shadow: inset 0 0 5px rgba(0,0,0,0.08);
|
83 |
+
}
|
84 |
+
::-webkit-scrollbar-thumb {
|
85 |
+
border-radius: 3px;
|
86 |
+
background: rgba(0,0,0,0.12);
|
87 |
+
-webkit-box-shadow: inset 0 0 10px rgba(0,0,0,0.2);
|
88 |
+
}
|
89 |
+
|
90 |
+
|
91 |
+
/* split line style */
|
92 |
+
.hr-edge-weak {
|
93 |
+
border: 0;
|
94 |
+
padding-top: 1px;
|
95 |
+
background: linear-gradient(to right, transparent, #d0d0d5, transparent);
|
96 |
+
}
|
97 |
+
|
98 |
+
.hr-double-arrow {
|
99 |
+
color: #d0d0d5;
|
100 |
+
border: double;
|
101 |
+
border-width: 3px 5px;
|
102 |
+
border-color: #d0d0d5 transparent;
|
103 |
+
height: 1px;
|
104 |
+
overflow: visible;
|
105 |
+
margin-left: 20px;
|
106 |
+
margin-right: 20px;
|
107 |
+
position: relative;
|
108 |
+
}
|
109 |
+
.hr-double-arrow:before,
|
110 |
+
.hr-double-arrow:after {
|
111 |
+
content: '';
|
112 |
+
position: absolute;
|
113 |
+
width: 5px; height: 5px;
|
114 |
+
border-width: 0 3px 3px 0;
|
115 |
+
border-style: double;
|
116 |
+
top: -3px;
|
117 |
+
background: radial-gradient(2px at 1px 1px, currentColor 2px, transparent 0) no-repeat;
|
118 |
+
}
|
119 |
+
.hr-double-arrow:before {
|
120 |
+
transform: rotate(-45deg);
|
121 |
+
left: -20px;
|
122 |
+
}
|
123 |
+
.hr-double-arrow:after {
|
124 |
+
transform: rotate(135deg);
|
125 |
+
right: -20px;
|
126 |
+
}
|
127 |
+
|
128 |
+
|
129 |
+
.citation-button {
|
130 |
+
-webkit-tap-highlight-color: rgba(0,0,0,0);
|
131 |
+
-webkit-text-size-adjust: 100%;
|
132 |
+
tab-size: 4;
|
133 |
+
color-scheme: light;
|
134 |
+
word-break: break-word;
|
135 |
+
white-space: pre-wrap;
|
136 |
+
font-family: Open Sans,sans-serif!important;
|
137 |
+
box-sizing: border-box;
|
138 |
+
border: 0 solid #e5e7eb;
|
139 |
+
touch-action: manipulation;
|
140 |
+
margin-right: 1px;
|
141 |
+
display: inline-flex;
|
142 |
+
height: .75rem;
|
143 |
+
min-width: .75rem;
|
144 |
+
align-items: center;
|
145 |
+
justify-content: center;
|
146 |
+
border-radius: 9999px;
|
147 |
+
background-color: var(--block-label-background-fill);
|
148 |
+
padding-left: .25rem;
|
149 |
+
padding-right: .25rem;
|
150 |
+
vertical-align: top;
|
151 |
+
font-size: 8px;
|
152 |
+
font-weight: 600;
|
153 |
+
line-height: 0;
|
154 |
+
color: var(--block-label-text-color);
|
155 |
+
outline: none;
|
156 |
+
cursor: pointer;
|
157 |
+
transition: color 0.3s;
|
158 |
+
}
|