# pythonic-rag/scripts/evaluate_rag.py
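"""Evaluate the RAG pipeline with RAGAS.

Builds a synthetic test set from the same Time Out London pages used by
app.py, answers each generated question through the retrieval pipeline,
scores the answers on six RAGAS metrics, and writes the results to docs/.
"""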
import os

from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas import evaluate, RunConfig, EvaluationDataset
# Load environment variables
load_dotenv()

# Ensure the OpenAI API key is set
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# RAGAS_APP_TOKEN is optional (only needed for dataset.upload()); guard the
# assignment so we never write None into os.environ, which raises a TypeError
ragas_token = os.getenv("RAGAS_APP_TOKEN")
if ragas_token:
    os.environ["RAGAS_APP_TOKEN"] = ragas_token
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
# Initialize the URLs (same as app.py)
urls = [
    "https://www.timeout.com/london/things-to-do-in-london-this-weekend",
    "https://www.timeout.com/london/london-events-in-march",
]
# Load documents
loader = WebBaseLoader(urls)
docs = loader.load()
# Initialize generator models for RAGAS
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
# Generate synthetic test dataset
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
# Upload the testset to app.ragas.io for inspection (requires RAGAS_APP_TOKEN)
if os.getenv("RAGAS_APP_TOKEN"):
    print(dataset.upload())
# Print the generated test questions
print("\nGenerated Test Questions:")
for i, test_row in enumerate(dataset):
    print(f"{i+1}. {test_row.eval_sample.user_input}")
# Set up the RAG pipeline for testing
# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)
# Create vector store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
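# ":memory:" starts an ephemeral in-process Qdrant instance; nothing persists between runs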
client = QdrantClient(":memory:")
client.create_collection(
    collection_name="london_events",
    # text-embedding-3-small produces 1536-dimensional vectors
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)
vector_store = QdrantVectorStore(
    client=client,
    collection_name="london_events",
    embedding=embeddings,
)
# Add documents to vector store
vector_store.add_documents(documents=split_documents)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})  # return the top 5 chunks per query
# Create RAG prompt
RAG_PROMPT = """
You are a helpful assistant who answers questions about events and activities in London.
Answer based only on the provided context. If you cannot find the answer, say so.
Question: {question}
Context: {context}
Answer:"""
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
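# gpt-4o-mini generates the answers; the stronger gpt-4 above is used only for testset generation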
llm = ChatOpenAI(model="gpt-4o-mini")
# Process each test question through the RAG pipeline
for test_row in dataset:
    # Retrieve relevant documents (get_relevant_documents is deprecated; invoke is the current API)
    retrieved_docs = retriever.invoke(test_row.eval_sample.user_input)

    # Format context and generate response
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    messages = rag_prompt.format_messages(question=test_row.eval_sample.user_input, context=context)
    response = llm.invoke(messages)

    # Store the response and retrieved contexts back on the sample for evaluation
    test_row.eval_sample.response = response.content
    test_row.eval_sample.retrieved_contexts = [doc.page_content for doc in retrieved_docs]
# Convert to evaluation dataset
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())
# Set up evaluator
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
# Run evaluation with all metrics
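# Six LLM-backed metrics per sample can be slow, so extend the per-operation timeout to 6 minutes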
custom_run_config = RunConfig(timeout=360)
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)
print("RAW RESULT: ", result)
print("Type of result: ", type(result))
# Convert to pandas DataFrame for better formatting
df = result.to_pandas()
print("\nEvaluation Results as DataFrame:")
print(df)
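# Ensure the output directory exists before writing any result files
os.makedirs("docs", exist_ok=True)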
try:
    # Try to save as markdown
    print("Attempting to save as markdown...")
    import tabulate  # imported explicitly to verify the package is installed
    df.to_markdown("docs/evaluation_results.md", index=False)
    print("Successfully saved as markdown!")
except ImportError as e:
    # Print a detailed error message and fall back to CSV
    print(f"Import Error: {e}")
    print("Note: Install 'tabulate' package for markdown output. Falling back to CSV format.")
    df.to_csv("docs/evaluation_results.csv", index=False)
# Save test questions
with open("docs/test_questions.md", "w") as f:
    f.write("# Test Questions\n\n")
    for i, test_row in enumerate(dataset):
        f.write(f"{i+1}. {test_row.eval_sample.user_input}\n")