devMls Miguel committed on
Commit
ffeefff
·
1 Parent(s): 5548e5b

organize chunks by document in the prompt (#3925)

Browse files

### What problem does this PR solve?

This PR organizes chunks in the prompt by document and indicates the
name of each document, in this way:

```
Document: {doc_name} \nContains the following relevant fragments:
chunk1
chunk2
chunk3

Document: {doc_name} \nContains the following relevant fragments:
chunk4
chunk5
```

Maybe this can serve as a baseline for adding metadata to the documents.

In my case, this allows improving the LLM's context about the origin of the
information.


### Type of change

- [X] New Feature (non-breaking change which adds functionality)

Co-authored-by: Miguel <your-noreply-github-email>

Files changed (1) hide show
  1. api/db/services/dialog_service.py +54 -1
api/db/services/dialog_service.py CHANGED
@@ -195,7 +195,32 @@ def chat(dialog, messages, stream=True, **kwargs):
195
  dialog.vector_similarity_weight,
196
  doc_ids=attachments,
197
  top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
198
- knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  logging.debug(
200
  "{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
201
  retrieval_tm = timer()
@@ -592,12 +617,40 @@ def ask(question, kb_ids, tenant_id):
592
  knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
593
 
594
  used_token_count = 0
 
595
  for i, c in enumerate(knowledges):
596
  used_token_count += num_tokens_from_string(c)
597
  if max_tokens * 0.97 < used_token_count:
598
  knowledges = knowledges[:i]
 
599
  break
600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  prompt = """
602
  Role: You're a smart assistant. Your name is Miss R.
603
  Task: Summarize the information from knowledge bases and answer user's question.
 
195
  dialog.vector_similarity_weight,
196
  doc_ids=attachments,
197
  top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
198
+
199
+ # Group chunks by document ID
200
+ doc_chunks = {}
201
+ for ck in kbinfos["chunks"]:
202
+ doc_id = ck["doc_id"]
203
+ if doc_id not in doc_chunks:
204
+ doc_chunks[doc_id] = []
205
+ doc_chunks[doc_id].append(ck["content_with_weight"])
206
+
207
+ # Create knowledges list with grouped chunks
208
+ knowledges = []
209
+ for doc_id, chunks in doc_chunks.items():
210
+ # Find the corresponding document name
211
+ doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id)
212
+
213
+ # Create a header for the document
214
+ doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n"
215
+
216
+ # Add numbered fragments
217
+ for i, chunk in enumerate(chunks, 1):
218
+ doc_knowledge += f"{i}. {chunk}\n"
219
+
220
+ knowledges.append(doc_knowledge)
221
+
222
+
223
+
224
  logging.debug(
225
  "{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
226
  retrieval_tm = timer()
 
617
  knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
618
 
619
  used_token_count = 0
620
+ chunks_num = 0
621
  for i, c in enumerate(knowledges):
622
  used_token_count += num_tokens_from_string(c)
623
  if max_tokens * 0.97 < used_token_count:
624
  knowledges = knowledges[:i]
625
+ chunks_num = chunks_num + 1
626
  break
627
 
628
+ # Group chunks by document ID
629
+ doc_chunks = {}
630
+ counter_chunks = 0
631
+ for ck in kbinfos["chunks"]:
632
+ if counter_chunks < chunks_num:
633
+ counter_chunks = counter_chunks + 1
634
+ doc_id = ck["doc_id"]
635
+ if doc_id not in doc_chunks:
636
+ doc_chunks[doc_id] = []
637
+ doc_chunks[doc_id].append(ck["content_with_weight"])
638
+
639
+ # Create knowledges list with grouped chunks
640
+ knowledges = []
641
+ for doc_id, chunks in doc_chunks.items():
642
+ # Find the corresponding document name
643
+ doc_name = next((d["doc_name"] for d in kbinfos.get("doc_aggs", []) if d["doc_id"] == doc_id), doc_id)
644
+
645
+ # Create a header for the document
646
+ doc_knowledge = f"Document: {doc_name} \nContains the following relevant fragments:\n"
647
+
648
+ # Add numbered fragments
649
+ for i, chunk in enumerate(chunks, 1):
650
+ doc_knowledge += f"{i}. {chunk}\n"
651
+
652
+ knowledges.append(doc_knowledge)
653
+
654
  prompt = """
655
  Role: You're a smart assistant. Your name is Miss R.
656
  Task: Summarize the information from knowledge bases and answer user's question.