Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import files, report_list, new_files, new_report_list
 from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant
 from auditqa.retriever import get_context
-from auditqa.reader import nvidia_client, dedicated_endpoint, serverless_api
+from auditqa.reader import nvidia_client, dedicated_endpoint, serverless_api, inf_provider
 from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template, get_client_location, get_client_ip, get_platform_info
 from dotenv import load_dotenv
 load_dotenv()

@@ -304,6 +304,34 @@ async def chat(query,history, method, sources,reports,subtype, client_ip=None, s
         async for update in process_stream():
             yield update

+    elif model_config.get('reader','TYPE') == 'INF_PROVIDERS':
+        chat_model = inf_provider()
+        start_time = time.time()
+        async def process_stream():
+            nonlocal answer_yet  # Use the outer scope's answer_yet variable
+            # Without nonlocal, Python would create a new local variable answer_yet
+            # inside process_stream(), instead of modifying the one from the outer scope.
+            # Iterate over the streaming response chunks
+            response = chat_model.chat.completions.create(
+                model=model_config.get("reader","INF_PROVIDER_MODEL"),
+                messages=messages,
+                stream=True,
+                max_tokens=int(model_config.get('reader','MAX_TOKENS')),
+            )
+            for message in response:
+                token = message.choices[0].delta.content
+                if token:
+                    answer_yet += token
+                    parsed_answer = parse_output_llm_with_sources(answer_yet)
+                    history[-1] = (query, parsed_answer)
+                    logs_data["answer"] = parsed_answer
+                    yield [tuple(x) for x in history], docs_html, logs_data, session_id
+
+        # Stream the response updates
+        async for update in process_stream():
+            yield update
+
+
     elif model_config.get('reader','TYPE') == 'DEDICATED':
         chat_model = dedicated_endpoint()
         ### adding for assessing computation time
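The new branch only assumes that chat_model exposes an OpenAI-style chat.completions.create(..., stream=True) call yielding chunks with choices[0].delta.content. A minimal sketch of what inf_provider() in auditqa/reader.py might return, assuming it wraps Hugging Face Inference Providers behind the OpenAI-compatible router (the base URL, the HF_TOKEN variable, and the openai dependency are assumptions, not shown in this diff):

import os
from openai import OpenAI

def inf_provider():
    # Hypothetical factory: any OpenAI-compatible client satisfies the interface
    # used in the INF_PROVIDERS branch above (chat.completions.create with
    # stream=True, chunks read via choices[0].delta.content).
    return OpenAI(
        base_url="https://router.huggingface.co/v1",  # assumed provider router URL
        api_key=os.getenv("HF_TOKEN"),                # assumed env var name
    )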
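The branch dispatch rests on the two-argument .get(section, option) lookups, which suggests getconfig() returns a standard configparser.ConfigParser; that return type is an assumption. Under it, a sketch of the [reader] section that would route chat() into the new INF_PROVIDERS path (key names come from the diff; the model and token values are illustrative only):

from configparser import ConfigParser

# Stand-in for the config that getconfig() would load from file.
model_config = ConfigParser()
model_config.read_string("""
[reader]
TYPE = INF_PROVIDERS
INF_PROVIDER_MODEL = meta-llama/Llama-3.1-8B-Instruct
MAX_TOKENS = 512
""")

# Mirrors the lookups in the diff: TYPE selects the branch,
# MAX_TOKENS is cast to int before being passed to the client.
assert model_config.get('reader', 'TYPE') == 'INF_PROVIDERS'
assert int(model_config.get('reader', 'MAX_TOKENS')) == 512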