| """ | |
import json
import sqlite3

import numpy as np
import pandas as pd
import requests
from minsearch import Index
from tqdm import tqdm

# Database connection
conn = sqlite3.connect('data/sqlite.db')
cursor = conn.cursor()

# Load ground truth data from CSV
def load_ground_truth():
    return pd.read_csv('data/ground-truth-retrieval.csv')

ground_truth = load_ground_truth()

# Load transcript data from SQLite
def load_transcripts():
    cursor.execute("SELECT * FROM transcript_segments")
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=['segment_id', 'video_id', 'content', 'start_time', 'duration'])

transcripts = load_transcripts()

# Create index
index = Index(
    text_fields=['content'],
    keyword_fields=['video_id', 'segment_id']
)
index.fit(transcripts.to_dict('records'))

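# Quick sanity check of the fitted index (hypothetical query string;
# the search() helper below wraps this same call):
# print(index.search(query='some topic', filter_dict={}, boost_dict={}, num_results=3))
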
# RAG flow
def search(query, boost=None):
    # boost maps text fields to weights, e.g. {'content': 1.5};
    # it is tuned by simple_optimize further down
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost or {},
        num_results=10
    )
    return results

prompt_template = '''
You're an AI assistant for YouTube video transcripts. Answer the QUESTION based on the CONTEXT from our transcript database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
'''.strip()

def build_prompt(query, search_results):
    context = "\n\n".join(
        f"Segment {i+1}: {result['content']}"
        for i, result in enumerate(search_results)
    )
    return prompt_template.format(question=query, context=context).strip()

def llm(prompt):
    # 'stream': False makes Ollama return one JSON object instead of
    # newline-delimited chunks, so response.json() parses cleanly
    response = requests.post('http://localhost:11434/api/generate', json={
        'model': 'phi',
        'prompt': prompt,
        'stream': False
    })
    if response.status_code == 200:
        return response.json()['response']
    print(f"Error: {response.status_code} - {response.text}")
    return None

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

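# Smoke test of the end-to-end flow (hypothetical question; pick one
# that your transcripts actually cover):
# print(rag("What topic does the video cover in the introduction?"))
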
# Evaluation metrics
def hit_rate(relevance_total):
    # Fraction of queries with at least one relevant result in the top-k
    return sum(any(line) for line in relevance_total) / len(relevance_total)

def mrr(relevance_total):
    # Mean reciprocal rank: 1/rank of the first relevant result, 0 if none
    scores = []
    for line in relevance_total:
        for rank, relevant in enumerate(line, 1):
            if relevant:
                scores.append(1 / rank)
                break
        else:
            scores.append(0)
    return sum(scores) / len(scores)

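# Worked example: three queries with top-3 relevance flags.
# Query 1 hits at rank 2, query 2 never hits, query 3 hits at rank 1:
#   relevance_total = [[False, True, False], [False, False, False], [True, False, False]]
#   hit_rate(relevance_total) -> 2/3 ~= 0.667
#   mrr(relevance_total)      -> (1/2 + 0 + 1) / 3 = 0.5
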
def evaluate(ground_truth, search_function):
    relevance_total = []
    for _, row in tqdm(ground_truth.iterrows(), total=len(ground_truth)):
        video_id = row['video_id']
        results = search_function(row['question'])
        # A result counts as relevant if it comes from the expected video
        relevance = [d['video_id'] == video_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Parameter optimization: tune the boost weight for the 'content' field
param_ranges = {
    'content': (0.0, 3.0),
}

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    # Random search: sample each parameter uniformly from its range
    # and keep the best-scoring draw
    best_params = None
    best_score = float('-inf')
    for _ in range(n_iterations):
        current_params = {
            param: np.random.uniform(min_val, max_val)
            for param, (min_val, max_val) in param_ranges.items()
        }
        current_score = objective_function(current_params)
        if current_score > best_score:
            best_score = current_score
            best_params = current_params
    return best_params, best_score

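# Sanity check of the optimizer on a toy objective (illustrative only):
# -(x - 1)^2 over [0, 3] peaks at x = 1, so the best draw should land near 1.0.
# toy_params, toy_score = simple_optimize(
#     {'content': (0.0, 3.0)},
#     lambda p: -(p['content'] - 1.0) ** 2,
#     n_iterations=100,
# )
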
def objective(boost_params):
    # Score a candidate boost configuration by the MRR it achieves
    def search_function(q):
        return search(q, boost_params)
    results = evaluate(ground_truth, search_function)
    return results['mrr']

# RAG evaluation (LLM-as-a-judge)
prompt2_template = '''
You are an expert evaluator for a YouTube transcript assistant.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
'''.strip()

def evaluate_rag(sample_size=200):
    sample = ground_truth.sample(n=sample_size, random_state=1)
    evaluations = []
    for _, row in tqdm(sample.iterrows(), total=len(sample)):
        question = row['question']
        answer_llm = rag(question)
        prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
        evaluation = llm(prompt)
        try:
            evaluation = json.loads(evaluation)
        except (TypeError, json.JSONDecodeError):
            # Skip rows where the judge returned nothing or malformed JSON
            continue
        evaluations.append((row['video_id'], question, answer_llm,
                            evaluation['Relevance'], evaluation['Explanation']))
    return evaluations

# Main execution
if __name__ == "__main__":
    print("Evaluating search performance...")
    # evaluate() passes the question string itself, so the lambda
    # forwards it directly rather than indexing into it
    search_performance = evaluate(ground_truth, lambda q: search(q))
    print(f"Search performance: {search_performance}")

    print("\nOptimizing search parameters...")
    best_params, best_score = simple_optimize(param_ranges, objective, n_iterations=20)
    print(f"Best parameters: {best_params}")
    print(f"Best score: {best_score}")
| print("\nEvaluating RAG performance...") | |
| rag_evaluations = evaluate_rag(sample_size=200) | |
    # Store RAG evaluations in the database
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS rag_evaluations (
            video_id TEXT,
            question TEXT,
            answer TEXT,
            relevance TEXT,
            explanation TEXT
        )
    ''')
    cursor.executemany('''
        INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
        VALUES (?, ?, ?, ?, ?)
    ''', rag_evaluations)
    conn.commit()
    print("Evaluation complete. Results stored in the database.")

    # Close the database connection
    conn.close()
| """ |