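# NOTE: the two triple-quoted blocks below are earlier, inactive iterations of this app
# (first a basic RAG demo, then a "working" version with MLflow/DagsHub tracking).
# The live code begins after the second closing ''' marker.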
'''
import os
import gradio as gr
import requests
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from langchain.embeddings import HuggingFaceEmbeddings

# ----------- 1. Custom LLM to call your LitServe endpoint -----------
class LitServeLLM(LLM):
    endpoint_url: str

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        response = requests.post(self.endpoint_url, json=payload)
        if response.status_code == 200:
            data = response.json()
            return data.get("response", "").strip()
        else:
            raise ValueError(f"Request failed: {response.status_code} {response.text}")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"

# ----------- 2. Connect to Pinecone -----------
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("rag-granite-index")

# ----------- 3. Load embedding model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# ----------- 4. Function to get top context from Pinecone -----------
def get_retrieved_context(query: str, top_k=3):
    query_embedding = embeddings_model.embed_query(query)
    results = index.query(
        namespace="rag-ns",
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)

# ----------- 5. Create LLMChain with your model -----------
model = LitServeLLM(
    endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)

# ----------- 6. Main RAG Function -----------
def rag_pipeline(question):
    try:
        retrieved_context = get_retrieved_context(question)
        response = llm_chain.invoke({
            "context": retrieved_context,
            "question": question
        })["text"].strip()
        # Only keep what's after "Answer:"
        if "Answer:" in response:
            response = response.split("Answer:", 1)[-1].strip()
        return response
    except Exception as e:
        return f"Error: {str(e)}"

# ----------- 7. Gradio UI -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 RAG Chatbot (Pinecone + LitServe)")
    question_input = gr.Textbox(label="Ask your question here")
    answer_output = gr.Textbox(label="Answer")
    ask_button = gr.Button("Get Answer")
    ask_button.click(rag_pipeline, inputs=question_input, outputs=answer_output)

if __name__ == "__main__":
    demo.launch()
'''
'''
working

import os
import gradio as gr
import requests
import mlflow
import dagshub
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
pinecone_api_key = os.environ["PINECONE_API_KEY"]
mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"]

# ----------- DagsHub & MLflow Setup -----------
dagshub.init(
    repo_owner='prathamesh.khade20',
    repo_name='Maintenance_AI_website',
    mlflow=True
)
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment("Maintenance-RAG-Chatbot")
mlflow.langchain.autolog()

# Initialize MLflow run for app configuration
with mlflow.start_run(run_name=f"App-Config-{datetime.now().strftime('%Y%m%d-%H%M%S')}") as setup_run:
    # Log environment configuration
    mlflow.log_params({
        "pinecone_index": "rag-granite-index",
        "embedding_model": "all-MiniLM-L6-v2",
        "namespace": "rag-ns",
        "top_k": 3,
        "llm_endpoint": "https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
    })
    # Log important files as artifacts
    mlflow.log_text("""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
""", "artifacts/prompt_template.txt")

# ----------- 1. Custom LLM for LitServe endpoint -----------
class LitServeLLM(LLM):
    endpoint_url: str

    @mlflow.trace
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        with mlflow.start_span("lit_serve_request"):
            start_time = time.time()
            response = requests.post(self.endpoint_url, json=payload)
            latency = time.time() - start_time
            mlflow.log_metric("lit_serve_latency", latency)
            if response.status_code == 200:
                data = response.json()
                mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
                return data.get("response", "").strip()
            else:
                mlflow.log_metric("request_errors", 1)
                error_info = {
                    "status_code": response.status_code,
                    "error": response.text,
                    "timestamp": datetime.now().isoformat()
                }
                mlflow.log_dict(error_info, "artifacts/error_log.json")
                raise ValueError(f"Request failed: {response.status_code}")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"

# ----------- 2. Pinecone Connection -----------
@mlflow.trace
def init_pinecone():
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-granite-index")

index = init_pinecone()

# ----------- 3. Embedding Model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# ----------- 4. Context Retrieval with Tracing -----------
@mlflow.trace
def get_retrieved_context(query: str, top_k=3):
    """Retrieve context from Pinecone with performance tracing"""
    with mlflow.start_span("embedding_generation"):
        start_time = time.time()
        query_embedding = embeddings_model.embed_query(query)
        mlflow.log_metric("embedding_latency", time.time() - start_time)
    with mlflow.start_span("pinecone_query"):
        start_time = time.time()
        results = index.query(
            namespace="rag-ns",
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        mlflow.log_metric("pinecone_latency", time.time() - start_time)
        mlflow.log_metric("retrieved_chunks", len(results['matches']))
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)

# ----------- 5. LLM Chain Setup -----------
model = LitServeLLM(
    endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)

# ----------- 6. RAG Pipeline with Full Tracing -----------
@mlflow.trace
def rag_pipeline(question):
    """End-to-end RAG pipeline with MLflow tracing"""
    try:
        # Start a new nested run for each query
        with mlflow.start_run(run_name=f"Query-{datetime.now().strftime('%H%M%S')}", nested=True):
            mlflow.log_param("user_question", question)
            # Retrieve context
            retrieved_context = get_retrieved_context(question)
            mlflow.log_text(retrieved_context, "artifacts/retrieved_context.txt")
            # Generate response
            start_time = time.time()
            response = llm_chain.invoke({
                "context": retrieved_context,
                "question": question
            })["text"].strip()
            # Clean response
            if "Answer:" in response:
                response = response.split("Answer:", 1)[-1].strip()
            # Log metrics
            mlflow.log_metric("response_latency", time.time() - start_time)
            mlflow.log_metric("response_length", len(response))
            mlflow.log_text(response, "artifacts/response.txt")
            return response
    except Exception as e:
        mlflow.log_metric("pipeline_errors", 1)
        error_info = {
            "error": str(e),
            "question": question,
            "timestamp": datetime.now().isoformat()
        }
        mlflow.log_dict(error_info, "artifacts/pipeline_errors.json")
        return f"Error: {str(e)}"

# ----------- 7. Gradio UI with Enhanced Tracking -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🛠️ Maintenance AI Assistant")

    # Track additional UI metrics
    usage_counter = gr.State(value=0)
    session_start = gr.State(value=datetime.now().isoformat())

    question_input = gr.Textbox(label="Ask your maintenance question")
    answer_output = gr.Textbox(label="AI Response")
    ask_button = gr.Button("Get Answer")
    feedback = gr.Radio(["Helpful", "Not Helpful"], label="Was this response helpful?")

    def track_usage(question, count, session_start, feedback=None):
        """Wrapper to track usage metrics with feedback"""
        count += 1
        # Start tracking context
        with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True):
            mlflow.log_param("question", question)
            mlflow.log_param("session_start", session_start)
            # Get response
            response = rag_pipeline(question)
            # Log feedback if provided
            if feedback:
                mlflow.log_param("user_feedback", feedback)
                mlflow.log_metric("helpful_responses", 1 if feedback == "Helpful" else 0)
            # Update metrics
            mlflow.log_metric("total_queries", count)
        return response, count, session_start

    ask_button.click(
        track_usage,
        inputs=[question_input, usage_counter, session_start],
        outputs=[answer_output, usage_counter, session_start]
    )
    feedback.change(
        track_usage,
        inputs=[question_input, usage_counter, session_start, feedback],
        outputs=[answer_output, usage_counter, session_start]
    )

if __name__ == "__main__":
    # Log deployment information
    with mlflow.start_run(run_name="Deployment-Info"):
        mlflow.log_params({
            "app_version": "1.0.0",
            "deployment_platform": "Lightning AI",
            "deployment_time": datetime.now().isoformat(),
            "code_version": os.getenv("GIT_COMMIT", "dev")
        })
    # Start Gradio app
    demo.launch()
'''
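# ----------- Active implementation: RAG + MLflow/DagsHub tracking + DeepEval -----------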
import os
import gradio as gr
import requests
import mlflow
import dagshub
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from datetime import datetime

# DeepEval imports
try:
    from deepeval.test_case import LLMTestCase
    from deepeval.metrics import AnswerRelevancyMetric, HallucinationMetric
    from deepeval.metrics import BaseMetric
    from deepeval.models.base_model import DeepEvalBaseLLM
except Exception:
    raise

# Optional LangChain Google generative integration (Gemini)
try:
    import google.generativeai as genai
    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
except Exception:
    ChatGoogleGenerativeAI = None
    genai = None

# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
LITSERVE_ENDPOINT = os.environ.get("LITSERVE_ENDPOINT", "https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict")
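# The app reads its configuration from environment variables (e.g. a local .env file
# picked up by load_dotenv above). An illustrative .env is sketched below; the values
# are placeholders, not real credentials:
#
#   PINECONE_API_KEY=<your-pinecone-key>
#   MLFLOW_TRACKING_URI=https://dagshub.com/<user>/<repo>.mlflow
#   GOOGLE_API_KEY=<your-gemini-key>
#   LITSERVE_ENDPOINT=https://<your-litserve-host>/predict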
# DagsHub & MLflow Setup (guarded)
try:
    dagshub.init(
        repo_owner='prathamesh.khade20',
        repo_name='Maintenance_AI_website',
        mlflow=True
    )
except Exception:
    pass

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("Maintenance-RAG-Chatbot")

# ----------- App configuration logging -----------
with mlflow.start_run(run_name=f"App-Config-{datetime.now().strftime('%Y%m%d-%H%M%S')}") as setup_run:
    mlflow.log_params({
        "pinecone_index": "rag-granite-index",
        "embedding_model": "all-MiniLM-L6-v2",
        "namespace": "rag-ns",
        "top_k": 3,
        "llm_endpoint": LITSERVE_ENDPOINT
    })
    mlflow.log_text("""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
""", "artifacts/prompt_template.txt")
# ----------- 1. Custom LLM for LitServe endpoint (Lightning AI) -----------
class LitServeLLM(LLM):
    endpoint_url: str

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        with mlflow.start_span("lit_serve_request"):
            start_time = time.time()
            response = requests.post(self.endpoint_url, json=payload)
            latency = time.time() - start_time
            mlflow.log_metric("lit_serve_latency", latency)
            if response.status_code == 200:
                data = response.json()
                mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
                return data.get("response", "").strip()
            else:
                mlflow.log_metric("request_errors", 1)
                error_info = {
                    "status_code": response.status_code,
                    "error": response.text,
                    "timestamp": datetime.now().isoformat()
                }
                mlflow.log_dict(error_info, "artifacts/error_log.json")
                raise ValueError(f"Request failed: {response.status_code}")

    # LangChain's LLM base class expects these as properties
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"
# ----------- 2. Pinecone Connection -----------
def init_pinecone():
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-granite-index")

try:
    index = init_pinecone()
except Exception:
    index = None

# ----------- 3. Embedding Model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
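# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so "rag-granite-index" must have
# been created with dimension 384 and a compatible metric. A sketch of how such an index
# could be created with the Pinecone client (assumed metric/cloud/region; adjust as needed):
#
#   from pinecone import ServerlessSpec
#   pc = Pinecone(api_key=PINECONE_API_KEY)
#   pc.create_index(
#       name="rag-granite-index",
#       dimension=384,
#       metric="cosine",
#       spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#   )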
# ----------- 4. Context Retrieval with Tracing -----------
def get_retrieved_context(query: str, top_k=3):
    with mlflow.start_span("embedding_generation"):
        start_time = time.time()
        query_embedding = embeddings_model.embed_query(query)
        mlflow.log_metric("embedding_latency", time.time() - start_time)
    # If the Pinecone connection failed at startup, fall back to an empty context
    if index is None:
        return ""
    with mlflow.start_span("pinecone_query"):
        start_time = time.time()
        results = index.query(
            namespace="rag-ns",
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        mlflow.log_metric("pinecone_latency", time.time() - start_time)
        mlflow.log_metric("retrieved_chunks", len(results['matches']))
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)
# ----------- 5. LLM Chain Setup (Lightning AI generator) -----------
model = LitServeLLM(endpoint_url=LITSERVE_ENDPOINT)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)
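# Note: recent LangChain releases deprecate LLMChain in favour of runnable composition.
# If the installed version warns about this, an equivalent sketch would be (the piped
# chain returns the raw string from the LLM rather than a {"text": ...} dict):
#
#   rag_chain = prompt | model
#   answer = rag_chain.invoke({"context": retrieved_context, "question": question})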
# ----------- 6. RAG Pipeline with Full Tracing (uses Lightning AI) -----------
def rag_pipeline(question):
    try:
        with mlflow.start_run(run_name=f"Query-{datetime.now().strftime('%H%M%S')}", nested=True):
            mlflow.log_param("user_question", question)
            retrieved_context = get_retrieved_context(question)
            mlflow.log_text(retrieved_context, "artifacts/retrieved_context.txt")

            start_time = time.time()
            response_obj = llm_chain.invoke({
                "context": retrieved_context,
                "question": question
            })
            response = response_obj.get("text") if isinstance(response_obj, dict) else getattr(response_obj, "text", str(response_obj))
            response = response.strip()
            if "Answer:" in response:
                response = response.split("Answer:", 1)[-1].strip()

            mlflow.log_metric("response_latency", time.time() - start_time)
            mlflow.log_metric("response_length", len(response))
            mlflow.log_text(response, "artifacts/response.txt")
            return response
    except Exception as e:
        mlflow.log_metric("pipeline_errors", 1)
        error_info = {
            "error": str(e),
            "question": question,
            "timestamp": datetime.now().isoformat()
        }
        mlflow.log_dict(error_info, "artifacts/pipeline_errors.json")
        return f"Error: {str(e)}"
# ----------- 7. DeepEval Wrapper(s) and Metrics Integration (Gemini evaluation) -----------
class GoogleVertexAI(DeepEvalBaseLLM):
    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = chat_model.invoke(prompt)
        if hasattr(res, 'content'):
            return res.content
        if isinstance(res, dict):
            return res.get('content') or res.get('text') or str(res)
        return str(res)

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return getattr(res, 'content', str(res))

    def get_model_name(self):
        return "Vertex AI Model"

class LitServeWrapper(DeepEvalBaseLLM):
    def __init__(self, lit_llm: LitServeLLM):
        self.lit_llm = lit_llm

    def load_model(self):
        return self.lit_llm

    def generate(self, prompt: str) -> str:
        return self.lit_llm._call(prompt)

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "LitServeModel"

# Custom metric that DOES NOT require expected_output: length-based utility metric
class LengthMetric(BaseMetric):
    def __init__(self, min_tokens: int = 1, max_tokens: int = 200):
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.score = 0.0
        self.success = False

    def measure(self, test_case: LLMTestCase):
        text = (test_case.actual_output or "")
        tokens = len(text.split())
        mid = (self.min_tokens + self.max_tokens) / 2
        dist = abs(tokens - mid)
        max_dist = max(mid - self.min_tokens, self.max_tokens - mid)
        self.score = max(0.0, 1.0 - (dist / max_dist))
        self.success = (self.min_tokens <= tokens <= self.max_tokens)
        return self.score

    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    def name(self):
        return "Length Metric"

# Helper to get the eval model: Gemini will be used as the evaluator by default
def get_deepeval_model(choice: str = 'gemini'):
    if choice == 'gemini' and ChatGoogleGenerativeAI is not None and GOOGLE_API_KEY:
        try:
            genai.configure(api_key=GOOGLE_API_KEY)
        except Exception:
            pass
        chat_model = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
        return GoogleVertexAI(model=chat_model)
    else:
        # fallback to litserve wrapper if gemini isn't available
        return LitServeWrapper(lit_llm=model)
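# Note: "gemini-pro" is the model name this code was written against; depending on the
# installed langchain-google-genai version and current Google API availability, a newer
# name such as "gemini-1.5-flash" may be required instead.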
# Function to run DeepEval tests and log them to MLflow (only metrics that don't need expected_output)
def run_deepeval_tests(test_cases: List[LLMTestCase], eval_model_choice: str = 'gemini'):
    model_wrapper = get_deepeval_model(eval_model_choice)
    # Only metrics that do not require an expected output
    # (note: HallucinationMetric still expects the test case to carry retrieval context)
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=model_wrapper)
    hallucination_metric = HallucinationMetric(threshold=0.5, model=model_wrapper)
    length_metric = LengthMetric(min_tokens=3, max_tokens=200)
    results = []
    with mlflow.start_run(run_name=f"DeepEval-{datetime.now().strftime('%H%M%S')}", nested=True):
        for i, tc in enumerate(test_cases):
            mlflow.log_param(f"tc_{i}_input", tc.input)
            mlflow.log_param(f"tc_{i}_actual", tc.actual_output)
            if tc.context:
                mlflow.log_text("\n".join(tc.context), f"artifacts/tc_{i}_context.txt")
            # Measure metrics
            answer_relevancy_metric.measure(tc)
            hallucination_metric.measure(tc)
            length_metric.measure(tc)
            entry = {
                "input": tc.input,
                "actual_output": tc.actual_output,
                "context": tc.context,
                "answer_relevancy_score": answer_relevancy_metric.score,
                "hallucination_score": hallucination_metric.score,
                "length_score": length_metric.score
            }
            # Log metrics to MLflow
            mlflow.log_metric(f"tc_{i}_answer_relevancy", answer_relevancy_metric.score)
            mlflow.log_metric(f"tc_{i}_hallucination", hallucination_metric.score)
            mlflow.log_metric(f"tc_{i}_length", length_metric.score)
            results.append(entry)
    return results
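# Illustrative standalone use of run_deepeval_tests (hypothetical question and values):
#
#   q = "What does error code E42 on the press mean?"
#   tc = LLMTestCase(
#       input=q,
#       actual_output=rag_pipeline(q),
#       context=[get_retrieved_context(q)],
#   )
#   print(run_deepeval_tests([tc], eval_model_choice="gemini"))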
# ----------- 8. Gradio UI with Evaluation Tab (auto-generate actual output from Lightning AI) -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🛠️ Maintenance AI Assistant + DeepEval (Lightning AI generator, Gemini evaluator)")

    with gr.Tabs():
        with gr.TabItem("Chat (RAG)"):
            usage_counter = gr.State(value=0)
            session_start = gr.State(value=datetime.now().isoformat())

            question_input = gr.Textbox(label="Ask your maintenance question")
            answer_output = gr.Textbox(label="AI Response")
            ask_button = gr.Button("Get Answer")
            feedback = gr.Radio(["Helpful", "Not Helpful"], label="Was this response helpful?")

            def track_usage(question, count, session_start, feedback=None):
                count += 1
                with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True):
                    mlflow.log_param("question", question)
                    mlflow.log_param("session_start", session_start)
                    response = rag_pipeline(question)
                    if feedback:
                        mlflow.log_param("user_feedback", feedback)
                        mlflow.log_metric("helpful_responses", 1 if feedback == "Helpful" else 0)
                    mlflow.log_metric("total_queries", count)
                return response, count, session_start

            ask_button.click(
                track_usage,
                inputs=[question_input, usage_counter, session_start],
                outputs=[answer_output, usage_counter, session_start]
            )
            feedback.change(
                track_usage,
                inputs=[question_input, usage_counter, session_start, feedback],
                outputs=[answer_output, usage_counter, session_start]
            )

        with gr.TabItem("DeepEval — Model Tests"):
            gr.Markdown("### Run DeepEval metrics (no expected output needed). Provide an input; optionally auto-generate the model response (Lightning AI). Gemini will evaluate by default.")
            tc_input = gr.Textbox(label="Test Input (prompt)")
            tc_actual = gr.Textbox(label="Actual Output (paste model response or leave empty to auto-generate)")
            tc_context = gr.Textbox(label="Context (optional)")
            auto_generate = gr.Checkbox(label="Auto-generate actual output from RAG (Lightning AI)", value=True)
            model_choice = gr.Radio(["gemini", "litserve"], value="gemini", label="Evaluation backend (Gemini recommended)")
            run_button = gr.Button("Run DeepEval")
            eval_output = gr.JSON(label="Evaluation Results")

            def run_single_eval(inp, actual, context, autogen, eval_backend):
                # If autogen is True (or no actual output was pasted), generate it via the RAG pipeline (Lightning AI)
                if autogen or (actual is None or actual.strip() == ""):
                    generated = rag_pipeline(inp)
                    actual_output = generated
                else:
                    actual_output = actual
                # Log whether the actual output was autogenerated
                with mlflow.start_run(run_name=f"DE-Run-{datetime.now().strftime('%H%M%S')}", nested=True):
                    mlflow.log_param("input", inp)
                    mlflow.log_param("autogenerated_actual", autogen)
                    if context:
                        mlflow.log_text(context, "artifacts/eval_context.txt")
                    tc = LLMTestCase(input=inp, actual_output=actual_output, expected_output=None, context=[context] if context else None)
                    results = run_deepeval_tests([tc], eval_model_choice=eval_backend)
                return results

            run_button.click(
                run_single_eval,
                inputs=[tc_input, tc_actual, tc_context, auto_generate, model_choice],
                outputs=[eval_output]
            )
if __name__ == "__main__":
    with mlflow.start_run(run_name="Deployment-Info"):
        mlflow.log_params({
            "app_version": "1.3.0",
            "deployment_platform": "Lightning AI / HuggingFace Space",
            "deployment_time": datetime.now().isoformat(),
            "code_version": os.getenv("GIT_COMMIT", "dev")
        })
    demo.launch()