policy-rag/policy_rag/sdg_utils.py
import asyncio
from typing import List

from langchain_core.documents.base import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langsmith import Client
from pandas import DataFrame
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestDataset, TestsetGenerator

async def ragas_sdg(
    context_docs: List[Document],
    n_qa_pairs: int = 20,
    embedding_model: OpenAIEmbeddings | None = None,
) -> TestDataset:
    """Generate a synthetic QA test set from the given documents with Ragas."""
    # Build the embedding model lazily so importing this module does not
    # require OpenAI credentials to already be configured.
    embeddings = embedding_model or OpenAIEmbeddings(model="text-embedding-3-large")

    # The generator LLM writes question/answer pairs; the critic LLM filters
    # out low-quality ones.
    generator_llm = ChatOpenAI(model="gpt-4o")
    critic_llm = ChatOpenAI(model="gpt-4o-mini")

    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings,
    )

    # Question mix: half simple questions, the rest split evenly between
    # multi-context and reasoning evolutions.
    distributions = {
        simple: 0.5,
        multi_context: 0.25,
        reasoning: 0.25,
    }

    test_set = generator.generate_with_langchain_docs(context_docs, n_qa_pairs, distributions)
    return test_set
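
# Usage sketch (not from the original module): `docs` is assumed to be a
# List[Document] produced by any LangChain loader. Because ragas_sdg is a
# coroutine, it has to be awaited (or driven with asyncio.run):
#
#     test_set = asyncio.run(ragas_sdg(docs, n_qa_pairs=10))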

def upload_dataset_langsmith(
    dataset: TestDataset | DataFrame,
    dataset_name: str,
    description: str,
) -> None:
    """Upload a generated test set to LangSmith as a dataset of examples."""
    client = Client()
    ls_dataset = client.create_dataset(
        dataset_name=dataset_name, description=description
    )

    # TODO: implement a Pydantic model to validate the input dataset
    if isinstance(dataset, TestDataset):
        dataset_df = dataset.to_pandas()
    elif isinstance(dataset, DataFrame):
        dataset_df = dataset
    else:
        raise TypeError("Dataset must be a ragas TestDataset or a pandas DataFrame")

    # Each row becomes one LangSmith example: question and contexts as inputs,
    # the ground-truth answer as the reference output.
    for _, row in dataset_df.iterrows():
        client.create_example(
            inputs={"question": row["question"], "context": row["contexts"]},
            outputs={"ground_truth": row["ground_truth"]},
            metadata={"metadata": row["metadata"][0], "evolution_type": row["evolution_type"]},
            dataset_id=ls_dataset.id,
        )
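

# Minimal end-to-end sketch, not part of the original module. It assumes
# OPENAI_API_KEY and LANGCHAIN_API_KEY are set in the environment; the loader,
# file path, and dataset name below are illustrative placeholders.
if __name__ == "__main__":
    from langchain_community.document_loaders import PyMuPDFLoader

    docs = PyMuPDFLoader("data/policy.pdf").load()  # hypothetical source document
    test_set = asyncio.run(ragas_sdg(docs, n_qa_pairs=20))
    upload_dataset_langsmith(
        test_set,
        dataset_name="policy-rag-sdg",  # hypothetical dataset name
        description="Synthetic QA pairs generated with Ragas",
    )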