Kevin Hu committed on
Commit
5e4c165
·
1 Parent(s): 84758ca

Fetch chunk by batches. (#4177)

Browse files

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement

Files changed (2) hide show
  1. rag/nlp/search.py +11 -4
  2. rag/utils/es_conn.py +1 -1
rag/nlp/search.py CHANGED
@@ -70,7 +70,7 @@ class Dealer:
70
  pg = int(req.get("page", 1)) - 1
71
  topk = int(req.get("topk", 1024))
72
  ps = int(req.get("size", topk))
73
- offset, limit = pg * ps, (pg + 1) * ps
74
 
75
  src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
76
  "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
@@ -380,6 +380,13 @@ class Dealer:
380
 
381
  def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
382
  condition = {"doc_id": doc_id}
383
- res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), 0, max_count, index_name(tenant_id), kb_ids)
384
- dict_chunks = self.dataStore.getFields(res, fields)
385
- return dict_chunks.values()
 
 
 
 
 
 
 
 
70
  pg = int(req.get("page", 1)) - 1
71
  topk = int(req.get("topk", 1024))
72
  ps = int(req.get("size", topk))
73
+ offset, limit = pg * ps, ps
74
 
75
  src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
76
  "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
 
380
 
381
  def chunk_list(self, doc_id: str, tenant_id: str, kb_ids: list[str], max_count=1024, fields=["docnm_kwd", "content_with_weight", "img_id"]):
382
  condition = {"doc_id": doc_id}
383
+ res = []
384
+ bs = 128
385
+ for p in range(0, max_count, bs):
386
+ res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), p, bs, index_name(tenant_id), kb_ids)
387
+ dict_chunks = self.dataStore.getFields(res, fields)
388
+ if dict_chunks:
389
+ res.extend(dict_chunks.values())
390
+ if len(dict_chunks.values()) < bs:
391
+ break
392
+ return res
rag/utils/es_conn.py CHANGED
@@ -196,7 +196,7 @@ class ESConnection(DocStoreConnection):
196
  s = s.sort(*orders)
197
 
198
  if limit > 0:
199
- s = s[offset:limit]
200
  q = s.to_dict()
201
  logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
202
 
 
196
  s = s.sort(*orders)
197
 
198
  if limit > 0:
199
+ s = s[offset:offset+limit]
200
  q = s.to_dict()
201
  logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
202