Kevin Hu
committed on
Commit
·
5e4c165
1
Parent(s):
84758ca
Fetch chunk by batches. (#4177)
Browse files
### What problem does this PR solve?
#4173
### Type of change
- [x] Performance Improvement
- rag/nlp/search.py +11 -4
- rag/utils/es_conn.py +1 -1
rag/nlp/search.py
CHANGED
@@ -70,7 +70,7 @@ class Dealer:
|
|
70 |
pg = int(req.get("page", 1)) - 1
|
71 |
topk = int(req.get("topk", 1024))
|
72 |
ps = int(req.get("size", topk))
|
73 |
-
offset, limit = pg * ps,
|
74 |
|
75 |
src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
|
76 |
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
|
@@ -380,6 +380,13 @@ class Dealer:
|
|
380 |
|
381 |
def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
|
382 |
condition = {"doc_id": doc_id}
|
383 |
-
res =
|
384 |
-
|
385 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
pg = int(req.get("page", 1)) - 1
|
71 |
topk = int(req.get("topk", 1024))
|
72 |
ps = int(req.get("size", topk))
|
73 |
+
offset, limit = pg * ps, ps
|
74 |
|
75 |
src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
|
76 |
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
|
|
|
380 |
|
381 |
def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
|
382 |
condition = {"doc_id": doc_id}
|
383 |
+
res = []
|
384 |
+
bs = 128
|
385 |
+
for p in range(0, max_count, bs):
|
386 |
+
res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), p, bs, index_name(tenant_id), kb_ids)
|
387 |
+
dict_chunks = self.dataStore.getFields(res, fields)
|
388 |
+
if dict_chunks:
|
389 |
+
res.extend(dict_chunks.values())
|
390 |
+
if len(dict_chunks.values()) < bs:
|
391 |
+
break
|
392 |
+
return res
|
rag/utils/es_conn.py
CHANGED
@@ -196,7 +196,7 @@ class ESConnection(DocStoreConnection):
|
|
196 |
s = s.sort(*orders)
|
197 |
|
198 |
if limit > 0:
|
199 |
-
s = s[offset:limit]
|
200 |
q = s.to_dict()
|
201 |
logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
|
202 |
|
|
|
196 |
s = s.sort(*orders)
|
197 |
|
198 |
if limit > 0:
|
199 |
+
s = s[offset:offset+limit]
|
200 |
q = s.to_dict()
|
201 |
logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
|
202 |
|