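"""Gradio demo: pull a random SQuAD entry, generate a basic and an enhanced
question for its gold answer, answer both with an extractive QA model, and ask
an OpenAI model to judge which generated question-answer pair is better."""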
import os
import logging
import json
import gradio as gr
from datasets import load_dataset
import random
from openai import OpenAI
from typing import Dict, Any
from dotenv import load_dotenv
from transformers import pipeline
import asyncio

# Import the required functions from the pipeline file
from pipeline_gradio_experimental import generate_basic_question, rank_questions_with_details

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the SQuAD dataset
dataset = load_dataset("squad")

# Initialize the question answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def get_random_entry():
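    """Pick a random entry from the SQuAD train split and return its context,
    first gold answer, and original question."""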
    random_index = random.randint(0, len(dataset['train']) - 1)
    entry = dataset['train'][random_index]
    return entry['context'], entry['answers']['text'][0], entry['question']

def generate_answer(context: str, question: str) -> str:
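    """Answer `question` against `context` with the extractive QA pipeline."""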
    try:
        result = qa_pipeline(question=question, context=context)
        return result['answer']
    except Exception as e:
        logger.error(f"Error in generate_answer: {e}")
        return "Failed to generate answer"

def compare_questions(context: str, original_answer: str, question1: str, answer1: str, question2: str, answer2: str) -> Dict[str, Any]:
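    """Have the OpenAI model score both question-answer pairs against the
    original answer and return the parsed JSON verdict (scores plus explanation)."""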
    try:
        response = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": "You are an expert in evaluating question-answer pairs based on a given context."},
                {"role": "user", "content": f"""Compare the following two question-answer pairs based on the given context and original answer. Evaluate their quality and relevance.

Context: {context}
Original Answer: {original_answer}

Question 1: {question1}
Answer 1: {answer1}

Question 2: {question2}
Answer 2: {answer2}

Score each question-answer pair on a scale of 0 to 10 based on the quality and relevance of the question and answer. Provide an explanation for your evaluation. Focus on how well the new answer matches the old answer considering the context. Make sure to grade one higher than the other."""}
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "question_comparison_evaluator",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "question1_score": {"type": "number"},
                            "question2_score": {"type": "number"},
                            "explanation": {"type": "string"}
                        },
                        "required": ["question1_score", "question2_score", "explanation"],
                        "additionalProperties": False
                    }
                }
            }
        )
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        logger.error(f"Error in comparing questions: {e}")
        return {"question1_score": 0, "question2_score": 0, "explanation": "Failed to compare questions"}

async def process_random_entry(progress=gr.Progress()):
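    """Streaming Gradio callback: yield the original SQuAD entry first so the UI
    updates immediately, then yield the generated questions, answers, and the
    comparison verdict once they are ready."""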
    context, original_answer, original_question = get_random_entry()
    
    # Yield the original context, question, and answer immediately
    yield context, original_question, original_answer, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    
    # Simulate some processing time
    await asyncio.sleep(1)
    progress(0.3, desc="Generating questions...")
    
    basic_question = generate_basic_question(context, original_answer)
    _, _, enhanced_question = rank_questions_with_details(context, original_answer)
    
    await asyncio.sleep(1)
    progress(0.6, desc="Generating answers...")
    
    basic_answer = generate_answer(context, basic_question)
    enhanced_answer = generate_answer(context, enhanced_question)
    
    await asyncio.sleep(1)
    progress(0.9, desc="Comparing questions...")
    
    comparison_result = compare_questions(context, original_answer, basic_question, basic_answer, enhanced_question, enhanced_answer)
    
    winner = "Basic" if comparison_result["question1_score"] > comparison_result["question2_score"] else "Enhanced"
    
    # Yield the final results
    yield (
        context,
        original_question,
        original_answer,
        gr.update(visible=True),
        gr.update(visible=True, value=f"Question: {basic_question}\nAnswer: {basic_answer}"),
        gr.update(visible=True, value=f"Question: {enhanced_question}\nAnswer: {enhanced_answer}"),
        gr.update(visible=True, value=f"Question 1 Score: {comparison_result['question1_score']}\n"
                                      f"Question 2 Score: {comparison_result['question2_score']}\n"
                                      f"Explanation: {comparison_result['explanation']}\n"
                                      f"Winner: {winner} Generation")
    )

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default()) as iface:
    gr.Markdown("# Question Generation and Comparison")
    gr.Markdown("Click the button to get a random entry from the SQuAD dataset and compare basic and enhanced question generation.")
    
    random_button = gr.Button("Get Random Question")
    
    with gr.Column(visible=False) as output_column:
        context_output = gr.Textbox(label="Original Context")
        original_question_output = gr.Textbox(label="Original Question")
        original_answer_output = gr.Textbox(label="Original Answer")
        basic_generation_output = gr.Textbox(label="Basic Generation", visible=False)
        enhanced_generation_output = gr.Textbox(label="Enhanced Generation", visible=False)
        comparison_result_output = gr.Textbox(label="Comparison Result", visible=False)

    random_button.click(
        fn=process_random_entry,
        outputs=[
            context_output,
            original_question_output,
            original_answer_output,
            output_column,
            basic_generation_output,
            enhanced_generation_output,
            comparison_result_output
        ]
    )

# Launch the app
if __name__ == "__main__":
    iface.launch()