import logging

import pandas as pd

import src.envs as envs
from src.backend.model_operations import SummaryGenerator, EvaluationModel
import src.backend.util as util

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        summary_generator (SummaryGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """
    def __init__(self, model, revision, precision, batch_size,
                 device, no_cache, limit, write_out=True,
                 output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        try:
            self.summary_generator = SummaryGenerator(model, revision)
            self.eval_model = EvaluationModel(envs.HEM_PATH)
        except Exception as e:
            logging.error(f"Error initializing Evaluator: {e}")
            raise
    def evaluate(self):
        """
        Performs the evaluation process by generating summaries
        and computing metrics.

        Returns:
            dict: A dictionary containing evaluation results.
        """
        try:
            df = pd.read_csv(envs.DATASET_PATH)
            generated_summaries_df = self.summary_generator.generate_summaries(df)

            avg_summary_len = self.summary_generator.avg_length
            answer_rate = self.summary_generator.answer_rate

            hallucination_scores = self.eval_model.evaluate_hallucination(
                generated_summaries_df)
            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
            hallucination_rate = self.eval_model.hallucination_rate

            results = util.format_results(model_name=self.model, revision=self.revision,
                                          precision=self.precision,
                                          factual_consistency_rate=factual_consistency_rate,
                                          hallucination_rate=hallucination_rate,
                                          answer_rate=answer_rate,
                                          avg_summary_len=avg_summary_len)
            return results
        except FileNotFoundError:
            logging.error(f"File not found: {envs.DATASET_PATH}")
            raise
        except Exception as e:
            logging.error(f"Error during evaluation: {e}")
            raise
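

if __name__ == "__main__":
    # Hypothetical usage sketch, not part of the original module: it only
    # illustrates how Evaluator is wired together. The model name, revision,
    # precision, and other argument values below are placeholder assumptions;
    # the real leaderboard backend drives this class from its own entry point.
    evaluator = Evaluator(
        model="example-org/example-model",  # assumed placeholder model id
        revision="main",
        precision="float16",
        batch_size=1,
        device="cpu",
        no_cache=True,
        limit=None,
        write_out=True,
        output_base_path="logs",
    )
    results = evaluator.evaluate()
    logging.info("Evaluation results: %s", results)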