Kevin Hu
committed on
Commit
·
75f6aef
1
Parent(s):
8ce7a30
accelerate term weight calculation (#3206)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- rag/llm/__init__.py +0 -1
- rag/nlp/query.py +3 -3
- rag/nlp/term_weight.py +15 -9
rag/llm/__init__.py
CHANGED
@@ -83,7 +83,6 @@ ChatModel = {
|
|
83 |
"VolcEngine": VolcEngineChat,
|
84 |
"BaiChuan": BaiChuanChat,
|
85 |
"MiniMax": MiniMaxChat,
|
86 |
-
"Minimax": MiniMaxChat,
|
87 |
"Mistral": MistralChat,
|
88 |
"Gemini": GeminiChat,
|
89 |
"Bedrock": BedrockChat,
|
|
|
83 |
"VolcEngine": VolcEngineChat,
|
84 |
"BaiChuan": BaiChuanChat,
|
85 |
"MiniMax": MiniMaxChat,
|
|
|
86 |
"Mistral": MistralChat,
|
87 |
"Gemini": GeminiChat,
|
88 |
"Bedrock": BedrockChat,
|
rag/nlp/query.py
CHANGED
@@ -165,7 +165,7 @@ class EsQueryer:
|
|
165 |
d = {}
|
166 |
if isinstance(tks, str):
|
167 |
tks = tks.split(" ")
|
168 |
-
for t, c in self.tw.weights(tks):
|
169 |
if t not in d:
|
170 |
d[t] = 0
|
171 |
d[t] += c
|
@@ -177,9 +177,9 @@ class EsQueryer:
|
|
177 |
|
178 |
def similarity(self, qtwt, dtwt):
|
179 |
if isinstance(dtwt, type("")):
|
180 |
-
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
|
181 |
if isinstance(qtwt, type("")):
|
182 |
-
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
|
183 |
s = 1e-9
|
184 |
for k, v in qtwt.items():
|
185 |
if k in dtwt:
|
|
|
165 |
d = {}
|
166 |
if isinstance(tks, str):
|
167 |
tks = tks.split(" ")
|
168 |
+
for t, c in self.tw.weights(tks, preprocess=False):
|
169 |
if t not in d:
|
170 |
d[t] = 0
|
171 |
d[t] += c
|
|
|
177 |
|
178 |
def similarity(self, qtwt, dtwt):
|
179 |
if isinstance(dtwt, type("")):
|
180 |
+
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
|
181 |
if isinstance(qtwt, type("")):
|
182 |
+
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
|
183 |
s = 1e-9
|
184 |
for k, v in qtwt.items():
|
185 |
if k in dtwt:
|
rag/nlp/term_weight.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
#
|
2 |
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
3 |
#
|
4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -158,7 +158,7 @@ class Dealer:
|
|
158 |
tks.append(t)
|
159 |
return tks
|
160 |
|
161 |
-
def weights(self, tks):
|
162 |
def skill(t):
|
163 |
if t not in self.sk:
|
164 |
return 1
|
@@ -222,14 +222,20 @@ class Dealer:
|
|
222 |
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
223 |
|
224 |
tw = []
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
229 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
230 |
-
np.array([ner(t) * postag(t) for t in
|
231 |
-
|
232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
|
234 |
S = np.sum([s for _, s in tw])
|
235 |
return [(t, s / S) for t, s in tw]
|
|
|
1 |
+
#
|
2 |
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
3 |
#
|
4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
158 |
tks.append(t)
|
159 |
return tks
|
160 |
|
161 |
+
def weights(self, tks, preprocess=True):
|
162 |
def skill(t):
|
163 |
if t not in self.sk:
|
164 |
return 1
|
|
|
222 |
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
223 |
|
224 |
tw = []
|
225 |
+
if not preprocess:
|
226 |
+
idf1 = np.array([idf(freq(t), 10000000) for t in tks])
|
227 |
+
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
|
|
228 |
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
229 |
+
np.array([ner(t) * postag(t) for t in tks])
|
230 |
+
tw = zip(tks, wts)
|
231 |
+
else:
|
232 |
+
for tk in tks:
|
233 |
+
tt = self.tokenMerge(self.pretoken(tk, True))
|
234 |
+
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
|
235 |
+
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
236 |
+
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
237 |
+
np.array([ner(t) * postag(t) for t in tt])
|
238 |
+
tw.extend(zip(tt, wts))
|
239 |
|
240 |
S = np.sum([s for _, s in tw])
|
241 |
return [(t, s / S) for t, s in tw]
|