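"""RAGAS evaluation for the London-events RAG pipeline.

Generates a synthetic testset from the same Time Out London pages used by
app.py, answers each question with the retrieval + generation pipeline, and
scores the results with a suite of RAGAS metrics.
"""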
import os

from dotenv import load_dotenv

from ragas import evaluate, RunConfig, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
# Load environment variables from .env
load_dotenv()

# Ensure the OpenAI API key is set
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# RAGAS_APP_TOKEN is optional; it is only needed for dataset.upload() below
ragas_app_token = os.getenv("RAGAS_APP_TOKEN")
if ragas_app_token:
    os.environ["RAGAS_APP_TOKEN"] = ragas_app_token
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_core.prompts import ChatPromptTemplate
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
# Initialize the URLs (same as app.py)
urls = [
    "https://www.timeout.com/london/things-to-do-in-london-this-weekend",
    "https://www.timeout.com/london/london-events-in-march",
]

# Load documents
loader = WebBaseLoader(urls)
docs = loader.load()
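# WebBaseLoader returns one Document per URL, with the page text in page_content.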
# Initialize generator models for RAGAS
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# Generate a synthetic test dataset from the loaded documents
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
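# Note: testset generation makes many LLM calls (summarization, knowledge-graph
# construction, query synthesis), so even 10 samples takes a while and costs tokens.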
# Upload the testset to app.ragas.io for inspection (requires RAGAS_APP_TOKEN)
if ragas_app_token:
    print(dataset.upload())
# Print the generated test questions
print("\nGenerated Test Questions:")
for i, test_row in enumerate(dataset):
    print(f"{i+1}. {test_row.eval_sample.user_input}")
# Set up the RAG pipeline under test

# Split documents into 1000-character chunks with 200 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)

# Create the vector store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
client = QdrantClient(":memory:")
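# ":memory:" runs an ephemeral in-process Qdrant instance; nothing is persisted.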
client.create_collection(
    collection_name="london_events",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),  # 1536 = text-embedding-3-small dimension
)
vector_store = QdrantVectorStore(
    client=client,
    collection_name="london_events",
    embedding=embeddings,
)

# Add documents to the vector store and expose a retriever
vector_store.add_documents(documents=split_documents)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
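# k=5: each question retrieves the five most similar chunks by cosine similarity.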
# Create the RAG prompt
RAG_PROMPT = """
You are a helpful assistant who answers questions about events and activities in London.
Answer based only on the provided context. If you cannot find the answer, say so.

Question: {question}
Context: {context}
Answer:"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
llm = ChatOpenAI(model="gpt-4o-mini")
# Process each test question through the RAG pipeline
for test_row in dataset:
    # Retrieve the top-k relevant chunks for the question
    retrieved_docs = retriever.invoke(test_row.eval_sample.user_input)

    # Format the context and generate a grounded response
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    messages = rag_prompt.format_messages(
        question=test_row.eval_sample.user_input, context=context
    )
    response = llm.invoke(messages)

    # Store the response and contexts back on the sample for evaluation
    test_row.eval_sample.response = response.content
    test_row.eval_sample.retrieved_contexts = [doc.page_content for doc in retrieved_docs]
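# Each sample now carries user_input, reference (from generation), response, and
# retrieved_contexts -- everything the metrics below need.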
# Convert the populated testset into a RAGAS evaluation dataset
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())
# Set up the evaluator model
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

# Run evaluation with all metrics
custom_run_config = RunConfig(timeout=360)
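# timeout=360 gives each evaluation call up to six minutes; samples that still
# time out are reported as NaN rather than aborting the whole run.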
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),    # are the reference's claims present in the retrieved context?
        Faithfulness(),        # is the response grounded in the retrieved context?
        FactualCorrectness(),  # does the response agree factually with the reference?
        ResponseRelevancy(),   # does the response actually address the question?
        ContextEntityRecall(), # were the entities in the reference retrieved?
        NoiseSensitivity(),    # how much do irrelevant retrieved chunks distort the answer?
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)
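# result aggregates one overall score per metric; to_pandas() below expands
# this to per-sample scores.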
print("RAW RESULT: ", result) | |
print("Type of result: ", type(result)) | |
# Convert to pandas DataFrame for better formatting | |
df = result.to_pandas() | |
print("\nEvaluation Results as DataFrame:") | |
print(df) | |
# Make sure the output directory exists before writing anything to it
os.makedirs("docs", exist_ok=True)

try:
    # Try to save as markdown (df.to_markdown requires the "tabulate" package)
    print("Attempting to save as markdown...")
    import tabulate  # noqa: F401 -- imported explicitly to verify installation

    df.to_markdown("docs/evaluation_results.md", index=False)
    print("Successfully saved as markdown!")
except ImportError as e:
    print(f"Import Error: {e}")
    print("Note: Install 'tabulate' package for markdown output. Falling back to CSV format.")
    df.to_csv("docs/evaluation_results.csv", index=False)
# Save the test questions alongside the evaluation results
with open("docs/test_questions.md", "w") as f:
    f.write("# Test Questions\n\n")
    for i, test_row in enumerate(dataset):
        f.write(f"{i+1}. {test_row.eval_sample.user_input}\n")