Kevin Hu committed
Commit · 0c61e3b
1 Parent(s): 7240dd7

less text, better extraction (#1869)
### What problem does this PR solve?

#1861

### Type of change

- [x] Refactoring
graphrag/index.py CHANGED (+6 -5)

@@ -75,10 +75,11 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
     llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
-    left_token_count = max(llm_bdl.max_length * 0.
+    left_token_count = max(llm_bdl.max_length * 0.6, left_token_count)

     assert left_token_count > 0, f"The LLM context length({llm_bdl.max_length}) is smaller than prompt({ext.prompt_token_count})"

+    BATCH_SIZE=1
     texts, graphs = [], []
     cnt = 0
     threads = []
@@ -86,15 +87,15 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
         for i in range(len(chunks)):
             tkn_cnt = num_tokens_from_string(chunks[i])
             if cnt+tkn_cnt >= left_token_count and texts:
-                for b in range(0, len(texts),
-                    threads.append(exe.submit(ext, ["\n".join(texts[b:b+
+                for b in range(0, len(texts), BATCH_SIZE):
+                    threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))
                 texts = []
                 cnt = 0
             texts.append(chunks[i])
             cnt += tkn_cnt
         if texts:
-            for b in range(0, len(texts),
-                threads.append(exe.submit(ext, ["\n".join(texts[b:b+
+            for b in range(0, len(texts), BATCH_SIZE):
+                threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))

         callback(0.5, "Extracting entities.")
         graphs = []
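The pattern the new code settles on is: accumulate chunks until the remaining token budget is reached, then submit each accumulated text on its own (BATCH_SIZE = 1) to a thread pool for entity extraction. The sketch below illustrates that batching loop in isolation; `num_tokens` and `extract` are hypothetical stand-ins for ragflow's `num_tokens_from_string` and `GraphExtractor`, and the executor setup and example values are assumptions, not code from this PR.

```python
from concurrent.futures import ThreadPoolExecutor

# Hypothetical stand-ins: in ragflow these roles are played by
# num_tokens_from_string and a GraphExtractor instance.
def num_tokens(text: str) -> int:
    return len(text.split())

def extract(text: str, entity_types: list) -> dict:
    return {"chars": len(text), "entity_types": entity_types}

def submit_in_batches(chunks, token_budget, entity_types, batch_size=1):
    """Accumulate chunks until the token budget would be exceeded, then
    submit each batch (a single text when batch_size == 1) to a worker."""
    futures, texts, cnt = [], [], 0
    with ThreadPoolExecutor(max_workers=4) as exe:
        for chunk in chunks:
            tkn_cnt = num_tokens(chunk)
            if cnt + tkn_cnt >= token_budget and texts:
                # Budget reached: hand off what we have, one text per call.
                for b in range(0, len(texts), batch_size):
                    futures.append(exe.submit(extract, "\n".join(texts[b:b + batch_size]), entity_types))
                texts, cnt = [], 0
            texts.append(chunk)
            cnt += tkn_cnt
        if texts:  # flush whatever is left after the loop
            for b in range(0, len(texts), batch_size):
                futures.append(exe.submit(extract, "\n".join(texts[b:b + batch_size]), entity_types))
        return [f.result() for f in futures]

chunks = ["alpha beta gamma", "delta epsilon", "zeta"]
print(submit_in_batches(chunks, token_budget=4, entity_types=["organization", "person"]))
```

With batch_size = 1 every extraction call sees a single chunk, which appears to be the trade-off in the commit title: more calls, each carrying less text. Separately, the new max(llm_bdl.max_length * 0.6, left_token_count) floor keeps the budget positive even when the extraction prompt is long; for illustration, with max_length = 8192 and prompt_token_count = 7500 the raw budget 8192 - 7500 - 1024 is negative, while the floor still leaves about 4915 tokens (these numbers are made up for the example, not taken from the PR).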