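# NOTE: the two triple-quoted blocks below are earlier, inactive iterations of this app
# (first a basic RAG demo, then a "working" version with MLflow/DagsHub tracking).
# The live code begins after the second closing ''' marker.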
'''
import os
import gradio as gr
import requests
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from langchain.embeddings import HuggingFaceEmbeddings

# ----------- 1. Custom LLM to call your LitServe endpoint -----------
class LitServeLLM(LLM):
    endpoint_url: str

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        response = requests.post(self.endpoint_url, json=payload)
        if response.status_code == 200:
            data = response.json()
            return data.get("response", "").strip()
        else:
            raise ValueError(f"Request failed: {response.status_code} {response.text}")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"

# ----------- 2. Connect to Pinecone -----------
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("rag-granite-index")

# ----------- 3. Load embedding model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# ----------- 4. Function to get top context from Pinecone -----------
def get_retrieved_context(query: str, top_k=3):
    query_embedding = embeddings_model.embed_query(query)
    results = index.query(
        namespace="rag-ns",
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)

# ----------- 5. Create LLMChain with your model -----------
model = LitServeLLM(
    endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)

# ----------- 6. Main RAG Function -----------
def rag_pipeline(question):
    try:
        retrieved_context = get_retrieved_context(question)
        response = llm_chain.invoke({
            "context": retrieved_context,
            "question": question
        })["text"].strip()
        # Only keep what's after "Answer:"
        if "Answer:" in response:
            response = response.split("Answer:", 1)[-1].strip()
        return response
    except Exception as e:
        return f"Error: {str(e)}"

# ----------- 7. Gradio UI -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 RAG Chatbot (Pinecone + LitServe)")
    question_input = gr.Textbox(label="Ask your question here")
    answer_output = gr.Textbox(label="Answer")
    ask_button = gr.Button("Get Answer")
    ask_button.click(rag_pipeline, inputs=question_input, outputs=answer_output)

if __name__ == "__main__":
    demo.launch()
'''
'''
working

import os
import gradio as gr
import requests
import mlflow
import dagshub
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
pinecone_api_key = os.environ["PINECONE_API_KEY"]
mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"]

# ----------- DagsHub & MLflow Setup -----------
dagshub.init(
    repo_owner='prathamesh.khade20',
    repo_name='Maintenance_AI_website',
    mlflow=True
)
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment("Maintenance-RAG-Chatbot")
mlflow.langchain.autolog()

# Initialize MLflow run for app configuration
with mlflow.start_run(run_name=f"App-Config-{datetime.now().strftime('%Y%m%d-%H%M%S')}") as setup_run:
    # Log environment configuration
    mlflow.log_params({
        "pinecone_index": "rag-granite-index",
        "embedding_model": "all-MiniLM-L6-v2",
        "namespace": "rag-ns",
        "top_k": 3,
        "llm_endpoint": "https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
    })
    # Log important files as artifacts
    mlflow.log_text("""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
""", "artifacts/prompt_template.txt")

# ----------- 1. Custom LLM for LitServe endpoint -----------
class LitServeLLM(LLM):
    endpoint_url: str

    @mlflow.trace
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        with mlflow.start_span("lit_serve_request"):
            start_time = time.time()
            response = requests.post(self.endpoint_url, json=payload)
            latency = time.time() - start_time
            mlflow.log_metric("lit_serve_latency", latency)
            if response.status_code == 200:
                data = response.json()
                mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
                return data.get("response", "").strip()
            else:
                mlflow.log_metric("request_errors", 1)
                error_info = {
                    "status_code": response.status_code,
                    "error": response.text,
                    "timestamp": datetime.now().isoformat()
                }
                mlflow.log_dict(error_info, "artifacts/error_log.json")
                raise ValueError(f"Request failed: {response.status_code}")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"

# ----------- 2. Pinecone Connection -----------
@mlflow.trace
def init_pinecone():
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-granite-index")

index = init_pinecone()

# ----------- 3. Embedding Model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# ----------- 4. Context Retrieval with Tracing -----------
@mlflow.trace
def get_retrieved_context(query: str, top_k=3):
    """Retrieve context from Pinecone with performance tracing"""
    with mlflow.start_span("embedding_generation"):
        start_time = time.time()
        query_embedding = embeddings_model.embed_query(query)
        mlflow.log_metric("embedding_latency", time.time() - start_time)
    with mlflow.start_span("pinecone_query"):
        start_time = time.time()
        results = index.query(
            namespace="rag-ns",
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        mlflow.log_metric("pinecone_latency", time.time() - start_time)
        mlflow.log_metric("retrieved_chunks", len(results['matches']))
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)

# ----------- 5. LLM Chain Setup -----------
model = LitServeLLM(
    endpoint_url="https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)

# ----------- 6. RAG Pipeline with Full Tracing -----------
@mlflow.trace
def rag_pipeline(question):
    """End-to-end RAG pipeline with MLflow tracing"""
    try:
        # Start a new nested run for each query
        with mlflow.start_run(run_name=f"Query-{datetime.now().strftime('%H%M%S')}", nested=True):
            mlflow.log_param("user_question", question)
            # Retrieve context
            retrieved_context = get_retrieved_context(question)
            mlflow.log_text(retrieved_context, "artifacts/retrieved_context.txt")
            # Generate response
            start_time = time.time()
            response = llm_chain.invoke({
                "context": retrieved_context,
                "question": question
            })["text"].strip()
            # Clean response
            if "Answer:" in response:
                response = response.split("Answer:", 1)[-1].strip()
            # Log metrics
            mlflow.log_metric("response_latency", time.time() - start_time)
            mlflow.log_metric("response_length", len(response))
            mlflow.log_text(response, "artifacts/response.txt")
            return response
    except Exception as e:
        mlflow.log_metric("pipeline_errors", 1)
        error_info = {
            "error": str(e),
            "question": question,
            "timestamp": datetime.now().isoformat()
        }
        mlflow.log_dict(error_info, "artifacts/pipeline_errors.json")
        return f"Error: {str(e)}"

# ----------- 7. Gradio UI with Enhanced Tracking -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🛠️ Maintenance AI Assistant")

    # Track additional UI metrics
    usage_counter = gr.State(value=0)
    session_start = gr.State(value=datetime.now().isoformat())

    question_input = gr.Textbox(label="Ask your maintenance question")
    answer_output = gr.Textbox(label="AI Response")
    ask_button = gr.Button("Get Answer")
    feedback = gr.Radio(["Helpful", "Not Helpful"], label="Was this response helpful?")

    def track_usage(question, count, session_start, feedback=None):
        """Wrapper to track usage metrics with feedback"""
        count += 1
        # Start tracking context
        with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True):
            mlflow.log_param("question", question)
            mlflow.log_param("session_start", session_start)
            # Get response
            response = rag_pipeline(question)
            # Log feedback if provided
            if feedback:
                mlflow.log_param("user_feedback", feedback)
                mlflow.log_metric("helpful_responses", 1 if feedback == "Helpful" else 0)
            # Update metrics
            mlflow.log_metric("total_queries", count)
        return response, count, session_start

    ask_button.click(
        track_usage,
        inputs=[question_input, usage_counter, session_start],
        outputs=[answer_output, usage_counter, session_start]
    )
    feedback.change(
        track_usage,
        inputs=[question_input, usage_counter, session_start, feedback],
        outputs=[answer_output, usage_counter, session_start]
    )

if __name__ == "__main__":
    # Log deployment information
    with mlflow.start_run(run_name="Deployment-Info"):
        mlflow.log_params({
            "app_version": "1.0.0",
            "deployment_platform": "Lightning AI",
            "deployment_time": datetime.now().isoformat(),
            "code_version": os.getenv("GIT_COMMIT", "dev")
        })
    # Start Gradio app
    demo.launch()
'''
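# ----------- Active implementation: RAG + MLflow/DagsHub tracking + DeepEval -----------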
import os
import gradio as gr
import requests
import mlflow
import dagshub
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import time
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from datetime import datetime

# DeepEval imports
try:
    from deepeval.test_case import LLMTestCase
    from deepeval.metrics import AnswerRelevancyMetric, HallucinationMetric
    from deepeval.metrics import BaseMetric
    from deepeval.models.base_model import DeepEvalBaseLLM
except Exception:
    raise

# Optional LangChain Google generative integration (Gemini)
try:
    import google.generativeai as genai
    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
except Exception:
    ChatGoogleGenerativeAI = None
    genai = None

# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
LITSERVE_ENDPOINT = os.environ.get("LITSERVE_ENDPOINT", "https://8001-01k2h9d9mervcmgfn66ybkpwvq.cloudspaces.litng.ai/predict")
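# The app reads its configuration from environment variables (e.g. a local .env file
# picked up by load_dotenv above). An illustrative .env is sketched below; the values
# are placeholders, not real credentials:
#
#   PINECONE_API_KEY=<your-pinecone-key>
#   MLFLOW_TRACKING_URI=https://dagshub.com/<user>/<repo>.mlflow
#   GOOGLE_API_KEY=<your-gemini-key>
#   LITSERVE_ENDPOINT=https://<your-litserve-host>/predict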
# DagsHub & MLflow Setup (guarded)
try:
    dagshub.init(
        repo_owner='prathamesh.khade20',
        repo_name='Maintenance_AI_website',
        mlflow=True
    )
except Exception:
    pass

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("Maintenance-RAG-Chatbot")

# ----------- App configuration logging -----------
with mlflow.start_run(run_name=f"App-Config-{datetime.now().strftime('%Y%m%d-%H%M%S')}") as setup_run:
    mlflow.log_params({
        "pinecone_index": "rag-granite-index",
        "embedding_model": "all-MiniLM-L6-v2",
        "namespace": "rag-ns",
        "top_k": 3,
        "llm_endpoint": LITSERVE_ENDPOINT
    })
    mlflow.log_text("""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
""", "artifacts/prompt_template.txt")
# ----------- 1. Custom LLM for LitServe endpoint (Lightning AI) -----------
class LitServeLLM(LLM):
    endpoint_url: str

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        payload = {"prompt": prompt}
        with mlflow.start_span("lit_serve_request"):
            start_time = time.time()
            response = requests.post(self.endpoint_url, json=payload)
            latency = time.time() - start_time
            mlflow.log_metric("lit_serve_latency", latency)
            if response.status_code == 200:
                data = response.json()
                mlflow.log_metric("response_tokens", len(data.get("response", "").split()))
                return data.get("response", "").strip()
            else:
                mlflow.log_metric("request_errors", 1)
                error_info = {
                    "status_code": response.status_code,
                    "error": response.text,
                    "timestamp": datetime.now().isoformat()
                }
                mlflow.log_dict(error_info, "artifacts/error_log.json")
                raise ValueError(f"Request failed: {response.status_code}")

    # LangChain's LLM base class expects these as properties
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"endpoint_url": self.endpoint_url}

    @property
    def _llm_type(self) -> str:
        return "litserve_llm"
# ----------- 2. Pinecone Connection -----------
def init_pinecone():
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    return pc.Index("rag-granite-index")

try:
    index = init_pinecone()
except Exception:
    index = None

# ----------- 3. Embedding Model -----------
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
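# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so "rag-granite-index" must have
# been created with dimension 384 and a compatible metric. A sketch of how such an index
# could be created with the Pinecone client (assumed metric/cloud/region; adjust as needed):
#
#   from pinecone import ServerlessSpec
#   pc = Pinecone(api_key=PINECONE_API_KEY)
#   pc.create_index(
#       name="rag-granite-index",
#       dimension=384,
#       metric="cosine",
#       spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#   )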
# ----------- 4. Context Retrieval with Tracing -----------
def get_retrieved_context(query: str, top_k=3):
    with mlflow.start_span("embedding_generation"):
        start_time = time.time()
        query_embedding = embeddings_model.embed_query(query)
        mlflow.log_metric("embedding_latency", time.time() - start_time)
    # If the Pinecone connection failed at startup, fall back to an empty context
    if index is None:
        return ""
    with mlflow.start_span("pinecone_query"):
        start_time = time.time()
        results = index.query(
            namespace="rag-ns",
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        mlflow.log_metric("pinecone_latency", time.time() - start_time)
        mlflow.log_metric("retrieved_chunks", len(results['matches']))
    context_parts = [match['metadata']['text'] for match in results['matches']]
    return "\n".join(context_parts)
# ----------- 5. LLM Chain Setup (Lightning AI generator) -----------
model = LitServeLLM(endpoint_url=LITSERVE_ENDPOINT)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a smart assistant. Based on the provided context, answer the question in 1–2 lines only.
If the context has more details, summarize it concisely.

Context:
{context}

Question: {question}
Answer:
"""
)

llm_chain = LLMChain(llm=model, prompt=prompt)
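# Note: recent LangChain releases deprecate LLMChain in favour of runnable composition.
# If the installed version warns about this, an equivalent sketch would be (the piped
# chain returns the raw string from the LLM rather than a {"text": ...} dict):
#
#   rag_chain = prompt | model
#   answer = rag_chain.invoke({"context": retrieved_context, "question": question})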
# ----------- 6. RAG Pipeline with Full Tracing (uses Lightning AI) -----------
def rag_pipeline(question):
    try:
        with mlflow.start_run(run_name=f"Query-{datetime.now().strftime('%H%M%S')}", nested=True):
            mlflow.log_param("user_question", question)
            retrieved_context = get_retrieved_context(question)
            mlflow.log_text(retrieved_context, "artifacts/retrieved_context.txt")

            start_time = time.time()
            response_obj = llm_chain.invoke({
                "context": retrieved_context,
                "question": question
            })
            response = response_obj.get("text") if isinstance(response_obj, dict) else getattr(response_obj, "text", str(response_obj))
            response = response.strip()
            if "Answer:" in response:
                response = response.split("Answer:", 1)[-1].strip()

            mlflow.log_metric("response_latency", time.time() - start_time)
            mlflow.log_metric("response_length", len(response))
            mlflow.log_text(response, "artifacts/response.txt")
            return response
    except Exception as e:
        mlflow.log_metric("pipeline_errors", 1)
        error_info = {
            "error": str(e),
            "question": question,
            "timestamp": datetime.now().isoformat()
        }
        mlflow.log_dict(error_info, "artifacts/pipeline_errors.json")
        return f"Error: {str(e)}"
# ----------- 7. DeepEval Wrapper(s) and Metrics Integration (Gemini evaluation) -----------
class GoogleVertexAI(DeepEvalBaseLLM):
    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = chat_model.invoke(prompt)
        if hasattr(res, 'content'):
            return res.content
        if isinstance(res, dict):
            return res.get('content') or res.get('text') or str(res)
        return str(res)

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return getattr(res, 'content', str(res))

    def get_model_name(self):
        return "Vertex AI Model"

class LitServeWrapper(DeepEvalBaseLLM):
    def __init__(self, lit_llm: LitServeLLM):
        self.lit_llm = lit_llm

    def load_model(self):
        return self.lit_llm

    def generate(self, prompt: str) -> str:
        return self.lit_llm._call(prompt)

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "LitServeModel"

# Custom metric that DOES NOT require expected_output: length-based utility metric
class LengthMetric(BaseMetric):
    def __init__(self, min_tokens: int = 1, max_tokens: int = 200):
        self.min_tokens = min_tokens
        self.max_tokens = max_tokens
        self.score = 0.0
        self.success = False

    def measure(self, test_case: LLMTestCase):
        text = (test_case.actual_output or "")
        tokens = len(text.split())
        mid = (self.min_tokens + self.max_tokens) / 2
        dist = abs(tokens - mid)
        max_dist = max(mid - self.min_tokens, self.max_tokens - mid)
        self.score = max(0.0, 1.0 - (dist / max_dist))
        self.success = (self.min_tokens <= tokens <= self.max_tokens)
        return self.score

    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    def name(self):
        return "Length Metric"

# Helper to get the eval model: Gemini will be used as the evaluator by default
def get_deepeval_model(choice: str = 'gemini'):
    if choice == 'gemini' and ChatGoogleGenerativeAI is not None and GOOGLE_API_KEY:
        try:
            genai.configure(api_key=GOOGLE_API_KEY)
        except Exception:
            pass
        chat_model = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
        return GoogleVertexAI(model=chat_model)
    else:
        # fallback to litserve wrapper if gemini isn't available
        return LitServeWrapper(lit_llm=model)
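# Note: "gemini-pro" is the model name this code was written against; depending on the
# installed langchain-google-genai version and current Google API availability, a newer
# name such as "gemini-1.5-flash" may be required instead.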
# Function to run DeepEval tests and log them to MLflow (only metrics that don't need expected_output)
def run_deepeval_tests(test_cases: List[LLMTestCase], eval_model_choice: str = 'gemini'):
    model_wrapper = get_deepeval_model(eval_model_choice)
    # Only metrics that do not require an expected output
    # (note: HallucinationMetric still expects the test case to carry retrieval context)
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=model_wrapper)
    hallucination_metric = HallucinationMetric(threshold=0.5, model=model_wrapper)
    length_metric = LengthMetric(min_tokens=3, max_tokens=200)
    results = []
    with mlflow.start_run(run_name=f"DeepEval-{datetime.now().strftime('%H%M%S')}", nested=True):
        for i, tc in enumerate(test_cases):
            mlflow.log_param(f"tc_{i}_input", tc.input)
            mlflow.log_param(f"tc_{i}_actual", tc.actual_output)
            if tc.context:
                mlflow.log_text("\n".join(tc.context), f"artifacts/tc_{i}_context.txt")
            # Measure metrics
            answer_relevancy_metric.measure(tc)
            hallucination_metric.measure(tc)
            length_metric.measure(tc)
            entry = {
                "input": tc.input,
                "actual_output": tc.actual_output,
                "context": tc.context,
                "answer_relevancy_score": answer_relevancy_metric.score,
                "hallucination_score": hallucination_metric.score,
                "length_score": length_metric.score
            }
            # Log metrics to MLflow
            mlflow.log_metric(f"tc_{i}_answer_relevancy", answer_relevancy_metric.score)
            mlflow.log_metric(f"tc_{i}_hallucination", hallucination_metric.score)
            mlflow.log_metric(f"tc_{i}_length", length_metric.score)
            results.append(entry)
    return results
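# Illustrative standalone use of run_deepeval_tests (hypothetical question and values):
#
#   q = "What does error code E42 on the press mean?"
#   tc = LLMTestCase(
#       input=q,
#       actual_output=rag_pipeline(q),
#       context=[get_retrieved_context(q)],
#   )
#   print(run_deepeval_tests([tc], eval_model_choice="gemini"))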
# ----------- 8. Gradio UI with Evaluation Tab (auto-generate actual output from Lightning AI) -----------
with gr.Blocks() as demo:
    gr.Markdown("# 🛠️ Maintenance AI Assistant + DeepEval (Lightning AI generator, Gemini evaluator)")

    with gr.Tabs():
        with gr.TabItem("Chat (RAG)"):
            usage_counter = gr.State(value=0)
            session_start = gr.State(value=datetime.now().isoformat())

            question_input = gr.Textbox(label="Ask your maintenance question")
            answer_output = gr.Textbox(label="AI Response")
            ask_button = gr.Button("Get Answer")
            feedback = gr.Radio(["Helpful", "Not Helpful"], label="Was this response helpful?")

            def track_usage(question, count, session_start, feedback=None):
                count += 1
                with mlflow.start_run(run_name=f"User-Interaction-{count}", nested=True):
                    mlflow.log_param("question", question)
                    mlflow.log_param("session_start", session_start)
                    response = rag_pipeline(question)
                    if feedback:
                        mlflow.log_param("user_feedback", feedback)
                        mlflow.log_metric("helpful_responses", 1 if feedback == "Helpful" else 0)
                    mlflow.log_metric("total_queries", count)
                return response, count, session_start

            ask_button.click(
                track_usage,
                inputs=[question_input, usage_counter, session_start],
                outputs=[answer_output, usage_counter, session_start]
            )
            feedback.change(
                track_usage,
                inputs=[question_input, usage_counter, session_start, feedback],
                outputs=[answer_output, usage_counter, session_start]
            )

        with gr.TabItem("DeepEval — Model Tests"):
            gr.Markdown("### Run DeepEval metrics (no expected output needed). Provide an input; optionally auto-generate the model response (Lightning AI). Gemini will evaluate by default.")
            tc_input = gr.Textbox(label="Test Input (prompt)")
            tc_actual = gr.Textbox(label="Actual Output (paste model response or leave empty to auto-generate)")
            tc_context = gr.Textbox(label="Context (optional)")
            auto_generate = gr.Checkbox(label="Auto-generate actual output from RAG (Lightning AI)", value=True)
            model_choice = gr.Radio(["gemini", "litserve"], value="gemini", label="Evaluation backend (Gemini recommended)")
            run_button = gr.Button("Run DeepEval")
            eval_output = gr.JSON(label="Evaluation Results")

            def run_single_eval(inp, actual, context, autogen, eval_backend):
                # If autogen is True (or no actual output was pasted), generate it via the RAG pipeline (Lightning AI)
                if autogen or (actual is None or actual.strip() == ""):
                    generated = rag_pipeline(inp)
                    actual_output = generated
                else:
                    actual_output = actual
                # Log whether the actual output was autogenerated
                with mlflow.start_run(run_name=f"DE-Run-{datetime.now().strftime('%H%M%S')}", nested=True):
                    mlflow.log_param("input", inp)
                    mlflow.log_param("autogenerated_actual", autogen)
                    if context:
                        mlflow.log_text(context, "artifacts/eval_context.txt")
                    tc = LLMTestCase(input=inp, actual_output=actual_output, expected_output=None, context=[context] if context else None)
                    results = run_deepeval_tests([tc], eval_model_choice=eval_backend)
                return results

            run_button.click(
                run_single_eval,
                inputs=[tc_input, tc_actual, tc_context, auto_generate, model_choice],
                outputs=[eval_output]
            )
if __name__ == "__main__":
    with mlflow.start_run(run_name="Deployment-Info"):
        mlflow.log_params({
            "app_version": "1.3.0",
            "deployment_platform": "Lightning AI / HuggingFace Space",
            "deployment_time": datetime.now().isoformat(),
            "code_version": os.getenv("GIT_COMMIT", "dev")
        })
    demo.launch()