Kevin Hu committed on
Commit
75f6aef
·
1 Parent(s): 8ce7a30

accelerate term weight calculation (#3206)

Browse files

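### What problem does this PR solve?

Adds a `preprocess` flag (default `True`) to `Dealer.weights` in `rag/nlp/term_weight.py`, so that callers in `rag/nlp/query.py` that already pass tokenized input can skip the per-token `pretoken`/`tokenMerge` step and have the term weights computed in a single pass. It also removes the duplicate `"Minimax"` entry from `ChatModel` in `rag/llm/__init__.py`.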



### Type of change

- [x] Performance Improvement

rag/llm/__init__.py CHANGED
@@ -83,7 +83,6 @@ ChatModel = {
83
  "VolcEngine": VolcEngineChat,
84
  "BaiChuan": BaiChuanChat,
85
  "MiniMax": MiniMaxChat,
86
- "Minimax": MiniMaxChat,
87
  "Mistral": MistralChat,
88
  "Gemini": GeminiChat,
89
  "Bedrock": BedrockChat,
 
83
  "VolcEngine": VolcEngineChat,
84
  "BaiChuan": BaiChuanChat,
85
  "MiniMax": MiniMaxChat,
 
86
  "Mistral": MistralChat,
87
  "Gemini": GeminiChat,
88
  "Bedrock": BedrockChat,
rag/nlp/query.py CHANGED
@@ -165,7 +165,7 @@ class EsQueryer:
165
  d = {}
166
  if isinstance(tks, str):
167
  tks = tks.split(" ")
168
- for t, c in self.tw.weights(tks):
169
  if t not in d:
170
  d[t] = 0
171
  d[t] += c
@@ -177,9 +177,9 @@ class EsQueryer:
177
 
178
  def similarity(self, qtwt, dtwt):
179
  if isinstance(dtwt, type("")):
180
- dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
181
  if isinstance(qtwt, type("")):
182
- qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
183
  s = 1e-9
184
  for k, v in qtwt.items():
185
  if k in dtwt:
 
165
  d = {}
166
  if isinstance(tks, str):
167
  tks = tks.split(" ")
168
+ for t, c in self.tw.weights(tks, preprocess=False):
169
  if t not in d:
170
  d[t] = 0
171
  d[t] += c
 
177
 
178
  def similarity(self, qtwt, dtwt):
179
  if isinstance(dtwt, type("")):
180
+ dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
181
  if isinstance(qtwt, type("")):
182
+ qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
183
  s = 1e-9
184
  for k, v in qtwt.items():
185
  if k in dtwt:
rag/nlp/term_weight.py CHANGED
@@ -1,4 +1,4 @@
1
- #
2
  # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -158,7 +158,7 @@ class Dealer:
158
  tks.append(t)
159
  return tks
160
 
161
- def weights(self, tks):
162
  def skill(t):
163
  if t not in self.sk:
164
  return 1
@@ -222,14 +222,20 @@ class Dealer:
222
  def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
223
 
224
  tw = []
225
- for tk in tks:
226
- tt = self.tokenMerge(self.pretoken(tk, True))
227
- idf1 = np.array([idf(freq(t), 10000000) for t in tt])
228
- idf2 = np.array([idf(df(t), 1000000000) for t in tt])
229
  wts = (0.3 * idf1 + 0.7 * idf2) * \
230
- np.array([ner(t) * postag(t) for t in tt])
231
-
232
- tw.extend(zip(tt, wts))
 
 
 
 
 
 
 
233
 
234
  S = np.sum([s for _, s in tw])
235
  return [(t, s / S) for t, s in tw]
 
1
+ #
2
  # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
  #
4
  # Licensed under the Apache License, Version 2.0 (the "License");
 
158
  tks.append(t)
159
  return tks
160
 
161
+ def weights(self, tks, preprocess=True):
162
  def skill(t):
163
  if t not in self.sk:
164
  return 1
 
222
  def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
223
 
224
  tw = []
225
+ if not preprocess:
226
+ idf1 = np.array([idf(freq(t), 10000000) for t in tks])
227
+ idf2 = np.array([idf(df(t), 1000000000) for t in tks])
 
228
  wts = (0.3 * idf1 + 0.7 * idf2) * \
229
+ np.array([ner(t) * postag(t) for t in tks])
230
+ tw = zip(tks, wts)
231
+ else:
232
+ for tk in tks:
233
+ tt = self.tokenMerge(self.pretoken(tk, True))
234
+ idf1 = np.array([idf(freq(t), 10000000) for t in tt])
235
+ idf2 = np.array([idf(df(t), 1000000000) for t in tt])
236
+ wts = (0.3 * idf1 + 0.7 * idf2) * \
237
+ np.array([ner(t) * postag(t) for t in tt])
238
+ tw.extend(zip(tt, wts))
239
 
240
  S = np.sum([s for _, s in tw])
241
  return [(t, s / S) for t, s in tw]