lw2134 committed on
Commit fa3eb69 · verified · 1 Parent(s): c62a3d3

Upload 27 files

policy_rag/__init__.py ADDED
File without changes
policy_rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (164 Bytes).

policy_rag/__pycache__/app_utils.cpython-311.pyc ADDED
Binary file (2.17 kB).

policy_rag/__pycache__/chains.cpython-311.pyc ADDED
Binary file (1.9 kB).

policy_rag/__pycache__/data_models.cpython-311.pyc ADDED
Binary file (3.09 kB).

policy_rag/__pycache__/eval_utils.cpython-311.pyc ADDED
Binary file (2.83 kB).

policy_rag/__pycache__/ragas_utils.cpython-311.pyc ADDED
Binary file (1.52 kB).

policy_rag/__pycache__/sdg_utils.cpython-311.pyc ADDED
Binary file (2.98 kB).

policy_rag/__pycache__/text_utils.cpython-311.pyc ADDED
Binary file (3.21 kB).

policy_rag/__pycache__/vectorstore_utils.cpython-311.pyc ADDED
Binary file (6.4 kB).
 
policy_rag/app_utils.py ADDED
@@ -0,0 +1,55 @@
import os
from dotenv import load_dotenv
load_dotenv()

from typing import Dict, Tuple
from collections.abc import Callable

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

from policy_rag.text_utils import get_recursive_token_chunks, get_semantic_chunks


# Config Options
CHUNK_METHOD = {
    'token-overlap': get_recursive_token_chunks,
    'semantic': get_semantic_chunks
}

EMBEDDING_MODEL_SOURCE = {
    'openai': OpenAIEmbeddings,
    'huggingface': HuggingFaceInferenceAPIEmbeddings
}


# Helpers
def get_chunk_func(chunk_method: Dict) -> Tuple[Callable, Dict]:
    chunk_func = CHUNK_METHOD[chunk_method['method']]

    if chunk_method['method'] == 'token-overlap':
        chunk_func_args = chunk_method['args']

    if chunk_method['method'] == 'semantic':
        args = chunk_method['args']
        chunk_func_args = {
            'embedding_model': EMBEDDING_MODEL_SOURCE[args['model_source']](model=args['model_name']),
            'breakpoint_type': args['breakpoint_type']
        }

    return chunk_func, chunk_func_args


def get_embedding_model(config) -> OpenAIEmbeddings | HuggingFaceInferenceAPIEmbeddings:
    if config['model_source'] == 'openai':
        model = EMBEDDING_MODEL_SOURCE[config['model_source']](model=config['model_name'])

    if config['model_source'] == 'huggingface':
        model = EMBEDDING_MODEL_SOURCE[config['model_source']](
            api_key=os.getenv('HF_API_KEY'),
            model_name=config['model_name'],
            api_url=config['api_url']
        )

    return model
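A minimal usage sketch (illustrative, not one of the uploaded files) of the config shapes these helpers expect; the concrete values and the `docs` list are assumptions:

# Hypothetical config; `docs` is a List[Document], e.g. from DocLoader in text_utils.py below.
from policy_rag.app_utils import get_chunk_func, get_embedding_model

chunk_config = {
    'method': 'token-overlap',
    'args': {'model_name': 'gpt-4', 'chunk_size': 150, 'chunk_overlap': 0}
}
chunk_func, chunk_func_args = get_chunk_func(chunk_config)
chunks = chunk_func(docs, **chunk_func_args)

embedding_model = get_embedding_model({'model_source': 'openai', 'model_name': 'text-embedding-3-large'})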
policy_rag/chains.py ADDED
@@ -0,0 +1,47 @@
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain.chains.base import Chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_core.vectorstores import VectorStoreRetriever


def get_qa_chain(
    retriever: VectorStoreRetriever,
    streaming: bool = False
) -> Chain:
    template = """
    Answer any questions based solely on the context below. If the context
    doesn't provide the answer, still do your best to answer the question
    factually, but indicate there isn't a clear answer in the context
    and that you're giving a best-effort response.

    Question:
    {question}

    Context:
    {context}
    """
    primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, streaming=streaming)

    prompt = ChatPromptTemplate.from_template(template)

    retrieval_augmented_qa_chain = (
        # INVOKE CHAIN WITH: {"question": "<<SOME USER QUESTION>>"}
        # "question" : populated by getting the value of the "question" key
        # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
        #              by getting the value of the "context" key from the previous step
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # "answer"   : the "context" and "question" values are used to format our prompt object and then piped
        #              into the LLM and stored in a key called "answer". NOTE: key MUST be "answer" for LangSmith.
        # "contexts" : populated by getting the value of the "context" key from the previous step.
        #              NOTE: key MUST be "contexts" for LangSmith.
        | {"answer": prompt | primary_qa_llm, "contexts": itemgetter("context")}
    )

    return retrieval_augmented_qa_chain
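A minimal invocation sketch (illustrative, not one of the uploaded files); it assumes `retriever` is any VectorStoreRetriever, e.g. from QdrantVectorstoreHelper.get_retriever below, and the sample question is made up:

# Invoke with a {"question": ...} dict, as the chain comments above describe.
from policy_rag.chains import get_qa_chain

qa_chain = get_qa_chain(retriever=retriever)
result = qa_chain.invoke({"question": "What does the policy say about data retention?"})
print(result["answer"].content)   # the LLM's answer message
print(len(result["contexts"]))    # the retrieved Documents used as context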
policy_rag/data_models.py ADDED
@@ -0,0 +1,50 @@
from pydantic import BaseModel, RootModel, field_validator
from langchain_core.documents.base import Document
from typing import List, Dict
from uuid import UUID


class DocList(RootModel[List[Document]]):
    model_config = {'validate_assignment': True}


class QuestionObject(RootModel[Dict[str, str]]):
    model_config = {'validate_assignment': True}

    @field_validator('root')
    def validate_key_is_uuid(cls, value):
        for key in value.keys():
            try:
                u = UUID(key)
                if u.version != 4:
                    raise ValueError(f"{key} is not UUID v4")
            except ValueError:
                raise ValueError(f"{key} is not UUID v4")
        return value


class ContextObject(RootModel[Dict[str, List[str]]]):
    model_config = {'validate_assignment': True}

    @field_validator('root')
    def validate_key_is_uuid(cls, value):
        for key in value.keys():
            try:
                u = UUID(key)
                if u.version != 4:
                    raise ValueError(f"{key} is not UUID v4")
            except ValueError:
                raise ValueError(f"{key} is not UUID v4")
        return value

    @field_validator('root')
    def validate_values_are_uuid(cls, value):
        for key, val in value.items():
            for v in val:
                try:
                    u = UUID(v)
                    if u.version != 4:
                        raise ValueError(f"{v} is not UUID v4")
                except ValueError:
                    raise ValueError(f"{v} is not UUID v4")
        return value
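A small sketch (illustrative, not one of the uploaded files) of the contract these root models enforce: keys, and for ContextObject also values, must parse as UUID v4 strings; the sample question text is made up:

from uuid import uuid4
from policy_rag.data_models import QuestionObject, ContextObject

qid, cid = str(uuid4()), str(uuid4())
QuestionObject({qid: "What is the policy's effective date?"})  # valid: key is a UUID v4
ContextObject({qid: [cid]})                                    # valid: key and values are UUID v4
# QuestionObject({"not-a-uuid": "..."}) raises a pydantic ValidationError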
policy_rag/eval_utils.py ADDED
@@ -0,0 +1,68 @@
import os
from dotenv import load_dotenv
load_dotenv()

from typing import List, Any
from langsmith import Client
from langsmith.evaluation import evaluate

from langchain_core.vectorstores import VectorStoreRetriever
import pandas as pd
import uuid

from policy_rag.chains import get_qa_chain
from policy_rag.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)

METRICS = {
    'faithfulness': faithfulness,
    'answer_relevancy': answer_relevancy,
    'context_precision': context_precision,
    'context_recall': context_recall
}


def get_ls_dataset(ls_dataset_name: str) -> pd.DataFrame:
    client = Client()
    examples = client.list_examples(dataset_name=ls_dataset_name)
    rows = [row.outputs | row.inputs | {'id': str(row.id)} for row in examples]
    return pd.DataFrame(rows)


# Get RAG QA Chain
def eval_on_ls_dataset(
    metrics: List[str],
    retriever: VectorStoreRetriever,
    ls_dataset_name: str,
    ls_project_name: str,
    ls_experiment_name: str
):
    os.environ['LANGCHAIN_PROJECT'] = ls_project_name

    print('Getting RAG QA Chain')
    rag_qa_chain = get_qa_chain(retriever=retriever)

    # Get LS Dataset and Eval Dataset
    #print('Getting Test Set from LangSmith')
    #test_df = get_ls_dataset(ls_dataset_name)
    #test_questions = test_df['question'].to_list()
    #test_groundtruths = test_df['ground_truth'].to_list()

    # Evaluate
    print('Running Experiment in LangSmith')
    print(f'Evaluating {metrics}')

    client = Client(auto_batch_tracing=False)
    results = evaluate(
        rag_qa_chain.invoke,
        data=ls_dataset_name,
        evaluators=[METRICS[metric] for metric in metrics],
        experiment_prefix=ls_experiment_name,
        client=client
    )

    return results
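A run sketch (illustrative, not one of the uploaded files); the dataset, project, and experiment names are placeholders, and LangSmith plus OpenAI credentials are assumed to be set in the environment:

from policy_rag.eval_utils import eval_on_ls_dataset

results = eval_on_ls_dataset(
    metrics=['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall'],
    retriever=retriever,                      # e.g. from QdrantVectorstoreHelper.get_retriever
    ls_dataset_name='policy-rag-testset',     # placeholder names
    ls_project_name='policy-rag-eval',
    ls_experiment_name='baseline-token-overlap'
)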
policy_rag/metrics/__init__.py ADDED
@@ -0,0 +1,4 @@
from ._faithfulness import faithfulness
from ._answer_relevancy import answer_relevancy
from ._context_precision import context_precision
from ._context_recall import context_recall
policy_rag/metrics/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (456 Bytes).

policy_rag/metrics/__pycache__/_answer_relevancy.cpython-311.pyc ADDED
Binary file (6.25 kB).

policy_rag/metrics/__pycache__/_context_precision.cpython-311.pyc ADDED
Binary file (3.54 kB).

policy_rag/metrics/__pycache__/_context_recall.cpython-311.pyc ADDED
Binary file (4.74 kB).

policy_rag/metrics/__pycache__/_faithfulness.cpython-311.pyc ADDED
Binary file (4.74 kB).
 
policy_rag/metrics/_answer_relevancy.py ADDED
@@ -0,0 +1,126 @@
from dotenv import load_dotenv
load_dotenv()
import json
from typing import List, Tuple
import numpy as np

from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticToolsParser
from langchain_openai import OpenAIEmbeddings


class VariantQuestionAnswerCommittal(BaseModel):
    """Use to generate a question based on the given answer
    and determine if the answer is noncommittal."""

    question: str = Field(description="The generated question based on the given answer.")
    noncommittal: bool = Field(description="Judgement of whether the answer is noncommittal.")


def cosine_similarity_np(embedding_a, embedding_b):
    """
    Calculate the cosine similarity between two vectors using numpy.

    Args:
    - embedding_a (np.array): First embedding vector.
    - embedding_b (np.array): Second embedding vector.

    Returns:
    - float: Cosine similarity value.
    """
    # Compute the norms of both embeddings
    norm_a = np.linalg.norm(embedding_a)
    norm_b = np.linalg.norm(embedding_b)

    # Compute cosine similarity
    cosine_sim = np.dot(embedding_a, embedding_b) / (norm_a * norm_b)
    return cosine_sim


def mean_cosine_similarity(embeddings_list, reference_embedding):
    """
    Calculate the mean cosine similarity of a list of embeddings to a reference embedding.

    Args:
    - embeddings_list (list of np.array): A list of embeddings.
    - reference_embedding (np.array): The reference embedding to which the cosine similarity is calculated.

    Returns:
    - float: The mean cosine similarity value.
    """
    similarities = []

    for embedding in embeddings_list:
        # Calculate cosine similarity using numpy
        sim = cosine_similarity_np(reference_embedding, embedding)
        similarities.append(sim)

    # Return the mean of the cosine similarities
    return np.mean(similarities)


def calculate_similarity(question: str, generated_questions: list[str]) -> float:
    embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
    question_vec = np.asarray(embeddings.embed_query(question)).reshape(1, -1)
    gen_question_vec = np.asarray(
        embeddings.embed_documents(generated_questions)
    ).reshape(len(generated_questions), -1)
    norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(
        question_vec, axis=1
    )

    return np.mean((np.dot(gen_question_vec, question_vec.T).reshape(-1,) / norm))


def generate_questions(answer: str) -> Tuple[str, bool]:
    template = """
    Generate a question for the given answer and identify if the answer is noncommittal.
    Give noncommittal as True if the answer is noncommittal and False if the answer is committal.
    A noncommittal answer is one that is evasive, vague, or ambiguous.
    For example, "I don't know" or "I'm not sure" are noncommittal answers.

    Answer:
    {answer}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [VariantQuestionAnswerCommittal]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    res = chain.invoke({'answer': answer})[0]
    question = res.question
    noncommittal = res.noncommittal

    return question, noncommittal


def answer_relevancy(run: Run, example: Example) -> dict:
    # Assumes the RAG app returns the prediction under the "answer" key in its outputs
    answer: str = run.outputs["answer"].content
    o_question: str = example.inputs['question']

    # Get generated question variants based on the chain's answer
    questions, noncommittals = [], []
    for _ in range(3):
        question, noncommittal = generate_questions(answer)

        if noncommittal:
            return {"key": "Answer Relevancy", "score": 0}

        questions.append(question)
        noncommittals.append(noncommittal)

    relevancy_score = calculate_similarity(o_question, questions)

    return {"key": "Answer Relevancy", "score": relevancy_score}
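A toy sketch (illustrative, not one of the uploaded files) of the mean-cosine-similarity aggregation that calculate_similarity applies to the real embeddings, shown here with the file's own numpy helpers and made-up vectors:

import numpy as np

reference = np.array([1.0, 0.0])    # stands in for the original question's embedding
variants = [np.array([1.0, 0.0]),   # stand in for embeddings of the regenerated questions
            np.array([0.0, 1.0])]
print(mean_cosine_similarity(variants, reference))  # (1.0 + 0.0) / 2 = 0.5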
policy_rag/metrics/_context_precision.py ADDED
@@ -0,0 +1,75 @@
from dotenv import load_dotenv
load_dotenv()
import json
from typing import List, Tuple
import numpy as np

from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticToolsParser


class ContextPrecisionVerification(BaseModel):
    """Answer for the verification task of whether the context was useful."""

    verdict: int = Field(..., description="Binary (0/1) verdict of verification")


def verify_context_precision(
    question: str,
    answer: str,
    context: str
) -> int:
    template = """
    Given the Question, Answer, and Context below, verify if the Context was useful in arriving at the given Answer.

    Question:
    {question}

    Answer:
    {answer}

    Context:
    {context}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [ContextPrecisionVerification]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    res = chain.invoke({'question': question, 'answer': answer, 'context': context})[0]

    return res.verdict


def context_precision(run: Run, example: Example) -> dict:
    question: str = example.inputs['question']
    ground_truth: str = example.outputs["ground_truth"]
    contexts: List[str] = [context.page_content for context in run.outputs['contexts']]

    # Verify whether each context chunk was relevant / useful for the ground-truth answer
    verdicts = []
    for context in contexts:
        verdict = verify_context_precision(question, ground_truth, context)
        verdicts.append(verdict)

    # Calculate precision@k for each context chunk
    precisions_at_k = []
    for idx, verdict in enumerate(verdicts):
        k = idx + 1
        precision_at_k = verdict / k
        precisions_at_k.append(precision_at_k)

    context_precision_score = sum(precisions_at_k) / (sum(verdicts) + 1e-10)

    return {"key": "Context Precision", "score": context_precision_score}
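A worked sketch (illustrative, not one of the uploaded files) of how the verdicts are aggregated, mirroring the loop above with made-up verdicts:

verdicts = [1, 0, 1]                                              # relevance verdicts in rank order
precisions_at_k = [v / (i + 1) for i, v in enumerate(verdicts)]   # [1.0, 0.0, 0.333...]
score = sum(precisions_at_k) / (sum(verdicts) + 1e-10)            # ~1.333 / 2 = ~0.67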
policy_rag/metrics/_context_recall.py ADDED
@@ -0,0 +1,95 @@
from dotenv import load_dotenv
load_dotenv()
import json
from typing import List

from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field

from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticToolsParser


class Statements(BaseModel):
    """Use to record each statement in the answer."""

    statements: List[str] = Field(description="The statements found in the text.")


class ContextRecallAttribution(BaseModel):
    """Use to determine if a statement can be attributed to the context."""

    attributed: int = Field(..., description="Binary (0/1) verdict of whether the statement can be attributed to the context.")


def extract_statements(ground_truth: str) -> List[str]:
    template = """
    Extract all statements from the Text below. Record each statement as
    a self-contained logical sentence that can be used to verify attribution
    later.

    Text:
    {ground_truth}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [Statements]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    return chain.invoke({'ground_truth': ground_truth})[0].statements


def get_statement_attribution(statement: str, formatted_docs: str) -> int:
    template = """
    Given a Statement and a Context, classify if the Statement can be attributed
    to the Context or not. Use only (1) or (0) as a binary classification.

    Statement: {statement}

    Context:
    {formatted_docs}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [ContextRecallAttribution]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    res = chain.invoke({'statement': statement, 'formatted_docs': formatted_docs})
    attributed = res[0].attributed

    return attributed


def context_recall(run: Run, example: Example) -> dict:
    ground_truth: str = example.outputs["ground_truth"]
    retrieved_docs: List[Document] = run.outputs["contexts"]
    formatted_docs: str = "\n".join([doc.page_content for doc in retrieved_docs])

    statements = extract_statements(ground_truth)

    attributions = []
    for statement in statements:
        attribution = get_statement_attribution(statement, formatted_docs)
        attributions.append(attribution)

    context_recall_score = sum(attributions) / len(attributions) if attributions else None

    return {"key": "Context Recall", "score": context_recall_score}
policy_rag/metrics/_faithfulness.py ADDED
@@ -0,0 +1,98 @@
from dotenv import load_dotenv
load_dotenv()
import json
from typing import List, Tuple

from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field

from langchain_core.documents.base import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticToolsParser


class Propositions(BaseModel):
    """Use to record each factual assertion."""

    propositions: List[str] = Field(description="The factual propositions generated by the model")


class FaithfulnessScore(BaseModel):
    """Use to score how faithful the propositions are to the docs."""

    reasoning: str = Field(description="The reasoning for the faithfulness score")
    score: bool


def extract_propositions(text: str) -> List[str]:
    template = """
    Extract all factual statements from the following Text:

    Text:
    {text}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [Propositions]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    return chain.invoke({'text': text})[0].propositions


def get_faithfulness_score(proposition: str, formatted_docs: str) -> Tuple[bool, str]:
    template = """
    Grade whether the Proposition can be logically concluded
    from the Docs:

    Proposition: {proposition}

    Docs:
    {formatted_docs}
    """
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    prompt = ChatPromptTemplate.from_template(template)

    tools = [FaithfulnessScore]

    chain = (
        prompt
        | llm.bind_tools(tools)
        | PydanticToolsParser(tools=tools)
    )

    res = chain.invoke({'proposition': proposition, 'formatted_docs': formatted_docs})
    score = res[0].score
    reasoning = res[0].reasoning

    return score, reasoning


def faithfulness(run: Run, example: Example) -> dict:
    # Assumes the RAG app returns the prediction under the "answer" key in its outputs
    response: str = run.outputs["answer"].content
    # Assumes the RAG app includes the retrieved docs under the "contexts" key in its outputs.
    # If not, they can be fetched from the child_runs of the run object.
    retrieved_docs: List[Document] = run.outputs["contexts"]
    formatted_docs = "\n".join([doc.page_content for doc in retrieved_docs])

    propositions = extract_propositions(response)

    scores, reasoning = [], []
    for proposition in propositions:
        score, reason = get_faithfulness_score(proposition, formatted_docs)
        scores.append(score)
        reasoning.append(reason)

    average_score = sum(scores) / len(scores) if scores else None
    comment = "\n".join(reasoning)

    return {"key": "faithfulness", "score": average_score, "comment": comment}
policy_rag/sdg_utils.py ADDED
@@ -0,0 +1,68 @@
import os

from ragas.testset.generator import TestsetGenerator
from ragas.testset.generator import TestDataset
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents.base import Document
from typing import List

from langsmith import Client
from pandas import DataFrame
import asyncio


async def ragas_sdg(
    context_docs: List[Document],
    n_qa_pairs: int = 20,
    embedding_model: OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-3-large')
) -> TestDataset:
    generator_llm = ChatOpenAI(model="gpt-4o")
    critic_llm = ChatOpenAI(model="gpt-4o-mini")
    embeddings = embedding_model

    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )

    distributions = {
        simple: 0.5,
        multi_context: 0.25,
        reasoning: 0.25
    }

    test_set = generator.generate_with_langchain_docs(context_docs, n_qa_pairs, distributions)

    return test_set


def upload_dataset_langsmith(
    dataset: TestDataset | DataFrame,
    dataset_name: str,
    description: str
) -> None:
    client = Client()

    ls_dataset = client.create_dataset(
        dataset_name=dataset_name, description=description
    )

    # TODO: implement a Pydantic model to validate input dataset
    if type(dataset) == TestDataset:
        dataset_df = dataset.to_pandas()
    elif type(dataset) == DataFrame:
        dataset_df = dataset
    else:
        raise TypeError('Dataset must be ragas TestDataset or pandas DataFrame')

    for idx, row in dataset_df.iterrows():
        client.create_example(
            inputs={"question": row["question"], "context": row["contexts"]},
            outputs={"ground_truth": row["ground_truth"]},
            metadata={'metadata': row['metadata'][0], "evolution_type": row['evolution_type']},
            dataset_id=ls_dataset.id
        )
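A generation-and-upload sketch (illustrative, not one of the uploaded files); the data directory and dataset name are placeholders, and OpenAI plus LangSmith credentials are assumed to be set:

import asyncio
from policy_rag.text_utils import DocLoader
from policy_rag.sdg_utils import ragas_sdg, upload_dataset_langsmith

docs = DocLoader().load_dir('data/')                   # hypothetical PDF directory
test_set = asyncio.run(ragas_sdg(docs, n_qa_pairs=20))
upload_dataset_langsmith(
    dataset=test_set,
    dataset_name='policy-rag-testset',                 # placeholder dataset name
    description='RAGAS-generated QA pairs over the policy PDFs'
)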
policy_rag/text_utils.py ADDED
@@ -0,0 +1,60 @@
import os
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents.base import Document
from policy_rag.data_models import DocList

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings


# Text Loading
class DocLoader:
    def __init__(self) -> None:
        # Per-instance list; a class-level default would be shared across loaders
        self.docs: List[Document] = DocList([]).root

    def load(self, path: str) -> List[Document]:
        if path.endswith('.pdf'):
            loader = PyMuPDFLoader(path)
            self.docs.extend(loader.load())
        else:
            print(f'Skipping {path} - not PDF')

        return self.docs


    def load_dir(self, dir_path: str) -> List[Document]:
        for doc_name in os.listdir(dir_path):
            doc_path = os.path.join(dir_path, doc_name)
            self.load(doc_path)

        return self.docs


# Text Splitting
def get_recursive_token_chunks(
    docs: List[Document],
    model_name: str = 'gpt-4',
    chunk_size: int = 150,
    chunk_overlap: int = 0
) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    return text_splitter.split_documents(docs)


def get_semantic_chunks(
    docs: List[Document],
    embedding_model: OpenAIEmbeddings,
    breakpoint_type: str = 'gradient'
) -> List[Document]:
    text_splitter = SemanticChunker(
        embeddings=embedding_model,
        breakpoint_threshold_type=breakpoint_type
    )

    return text_splitter.split_documents(docs)
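A loading-and-chunking sketch (illustrative, not one of the uploaded files); the 'data/' directory is a placeholder:

from policy_rag.text_utils import DocLoader, get_recursive_token_chunks

docs = DocLoader().load_dir('data/')    # loads every PDF in the directory
chunks = get_recursive_token_chunks(docs, model_name='gpt-4', chunk_size=150, chunk_overlap=0)
print(f'{len(chunks)} chunks')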
policy_rag/vectorstore_utils.py ADDED
@@ -0,0 +1,115 @@
import os

from langchain_core.documents.base import Document
from langchain_qdrant import QdrantVectorStore
from langchain_community.vectorstores import Qdrant
from langchain_core.vectorstores import VectorStoreRetriever
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from typing import Literal, Optional, List, Any
from uuid import UUID


class QdrantVectorstoreHelper:
    def __init__(self) -> None:
        self.client = None

        if os.getenv('QDRANT_API_KEY') and os.getenv('QDRANT_URL'):
            self.client = QdrantClient(
                url=os.getenv('QDRANT_URL'),
                api_key=os.getenv('QDRANT_API_KEY')
            )
        else:
            print("Qdrant API Key and URL not present.")


    def create_collection(self, name: str, vector_size: int) -> None:
        if self.client:
            self.client.create_collection(
                collection_name=name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )
        else:
            print('No Qdrant Client')


    def create_local_vectorstore(
        self,
        chunks: List[Document],
        embedding_model: OpenAIEmbeddings | HuggingFaceInferenceAPIEmbeddings = OpenAIEmbeddings(model='text-embedding-3-large'),
        vector_size: int = 3072
    ) -> None:
        self.local_vectorstore = Qdrant.from_documents(
            documents=chunks,
            vector_params={'size': vector_size, 'distance': Distance.COSINE},
            embedding=embedding_model,
            batch_size=32 if type(embedding_model) == HuggingFaceInferenceAPIEmbeddings else 64,
            location=":memory:"
        )


    def create_cloud_vectorstore(
        self,
        chunks: List[Document],
        collection_name: str,
        embedding_model: OpenAIEmbeddings | HuggingFaceInferenceAPIEmbeddings = OpenAIEmbeddings(model='text-embedding-3-large'),
        vector_size: int = 3072
    ) -> None:
        try:
            self.cloud_vectorstore = QdrantVectorStore.from_existing_collection(
                embedding=embedding_model,
                collection_name=collection_name,
                url=os.getenv('QDRANT_URL'),
                api_key=os.getenv('QDRANT_API_KEY')
            )
        except Exception:
            self.cloud_vectorstore = QdrantVectorStore.from_documents(
                documents=chunks,
                embedding=embedding_model,
                vector_params={'size': vector_size, 'distance': Distance.COSINE},
                collection_name=collection_name,
                batch_size=4 if type(embedding_model) == HuggingFaceInferenceAPIEmbeddings else 64,
                prefer_grpc=True,
                url=os.getenv('QDRANT_URL'),
                api_key=os.getenv('QDRANT_API_KEY')
            )


    def add_docs_to_vectorstore(
        self,
        collection_name: Literal['memory'] | str,
        chunks: List[Document],
        uuids: List[UUID]
    ) -> None:
        str_uuids = [str(uuid) for uuid in uuids]
        if collection_name == 'memory':
            self.local_vectorstore.add_documents(documents=chunks, ids=str_uuids)
        else:
            self.cloud_vectorstore = QdrantVectorStore.from_existing_collection(
                collection_name=collection_name,
                url=os.getenv('QDRANT_URL'),
                api_key=os.getenv('QDRANT_API_KEY')
            )

            self.cloud_vectorstore.add_documents(documents=chunks, ids=str_uuids)


    def get_retriever(
        self,
        collection_name: Literal['memory'] | str,
        k: int = 3,
        embedding_model: OpenAIEmbeddings = OpenAIEmbeddings(model='text-embedding-3-large')
    ) -> VectorStoreRetriever:
        if collection_name == 'memory':
            return self.local_vectorstore.as_retriever(search_kwargs={'k': k})
        else:
            self.cloud_vectorstore = QdrantVectorStore.from_existing_collection(
                collection_name=collection_name,
                embedding=embedding_model,
                url=os.getenv('QDRANT_URL'),
                api_key=os.getenv('QDRANT_API_KEY')
            )

            return self.cloud_vectorstore.as_retriever(search_kwargs={'k': k})
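An end-to-end sketch (illustrative, not one of the uploaded files) tying the helper to the QA chain; it assumes OPENAI_API_KEY is set and `chunks` comes from one of the chunkers above:

from policy_rag.vectorstore_utils import QdrantVectorstoreHelper
from policy_rag.chains import get_qa_chain

helper = QdrantVectorstoreHelper()
helper.create_local_vectorstore(chunks=chunks)                    # in-memory Qdrant collection
retriever = helper.get_retriever(collection_name='memory', k=3)
qa_chain = get_qa_chain(retriever=retriever)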