import json

import nltk
import torch
import evaluate  # replaces the deprecated datasets.load_metric API
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# --- Model and Metric Loading ---
class ContentEvaluator:
    def __init__(self):
        self.semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')

        # NOTE: the original pointed at "unicamp-dl/ptt5-base-portuguese-vocab", a T5
        # encoder-decoder checkpoint that cannot be loaded with AutoModelForCausalLM.
        # A Portuguese causal LM (e.g. a GPT-2 variant) is needed for perplexity.
        self.perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
        self.perplexity_tokenizer = AutoTokenizer.from_pretrained(self.perplexity_model_name)
        self.perplexity_model = None  # loaded lazily in calculate_perplexity

        # Load Hugging Face metrics (evaluate replaces the deprecated load_metric)
        self.bertscore_metric = evaluate.load("bertscore")
        self.bleu_metric = evaluate.load("bleu")
        self.rouge_metric = evaluate.load("rouge")
        self.meteor_metric = evaluate.load("meteor")
        self.sacrebleu_metric = evaluate.load("sacrebleu")  # more robust BLEU implementation

        # Judge LLM for content quality and hallucination detection.
        # The original intent was GPT-3.5 / GPT-4 (or Gemini), but "gpt-3.5-turbo" is an
        # API-only model, not a Hugging Face checkpoint, so pipeline() cannot load it.
        # An open instruction-tuned model is used here as a stand-in; swap in an API
        # client if a hosted judge is available.
        self.judge_model_name = "HuggingFaceH4/zephyr-7b-beta"
        self.judge = pipeline("text-generation", model=self.judge_model_name)
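    # NOTE: estimate_semantic_similarity is referenced in analyze_content_for_review below but
    # was not defined in the original file. This is a minimal sketch using the SentenceTransformer
    # loaded above: it encodes both texts and returns their cosine similarity.
    def estimate_semantic_similarity(self, generated_text, reference_text):
        embeddings = self.semantic_similarity_model.encode(
            [generated_text, reference_text], convert_to_tensor=True
        )
        return util.cos_sim(embeddings[0], embeddings[1]).item()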
    def calculate_perplexity(self, text):
        """
        Calculates the perplexity of a text using a Portuguese causal language model.
        Perplexity measures how well the language model predicts the text.
        Lower perplexity indicates the text is more predictable and likely to be grammatically correct.
        Higher perplexity suggests the text is more surprising or unusual, potentially indicating errors or nonsensical content.
        """
        try:
            # Load the model once and cache it for subsequent calls.
            if self.perplexity_model is None:
                self.perplexity_model = AutoModelForCausalLM.from_pretrained(self.perplexity_model_name)
                self.perplexity_model.eval()
            with torch.no_grad():
                tensor_input = self.perplexity_tokenizer.encode(text, return_tensors='pt')
                loss = self.perplexity_model(tensor_input, labels=tensor_input).loss
            return torch.exp(loss).item()
        except Exception as e:
            print(f"Error calculating perplexity: {e}")
            return float('inf')
    def detect_hallucination_with_llm(self, text, window_size=200):
        """
        Detects potential hallucinations using the judge LLM with a refined prompt.
        The text is split into word windows so each prompt stays within the model's context.
        """
        hallucinations = []
        text_chunks = nltk.word_tokenize(text)
        for i in range(0, len(text_chunks), window_size):
            chunk = " ".join(text_chunks[i:i + window_size])
            prompt = f"""
You are an expert in identifying factual errors and inconsistencies in educational text.
Your task is to meticulously analyze the provided text excerpt and pinpoint any potential hallucinations.
Focus on identifying claims or statements that exhibit the following characteristics:
* **Factual Inaccuracy:** Assertions that are demonstrably false or lack credible supporting evidence.
* **Logical Fallacies:** Statements containing flawed reasoning or internal contradictions.
* **Nonsensical Claims:** Assertions that are absurd, meaningless, or defy common sense.
* **Invented Information:** Fabricated details or events that have no basis in reality.

Text Excerpt:
```
{chunk}
```

For each potential hallucination, provide:
- **Hallucination:** The specific text you believe is a hallucination.
- **Explanation:** A detailed and precise justification for why you classify it as a hallucination.

Return your analysis as a JSON list of dictionaries, strictly adhering to the following format:
```json
[
  {{"hallucination": "[The hallucinated text]", "explanation": "[Your detailed explanation]"}}
]
```
"""
            # return_full_text=False keeps the prompt out of the returned text;
            # max_new_tokens bounds the completion rather than prompt + completion.
            response = self.judge(prompt, max_new_tokens=512, return_full_text=False)[0]['generated_text'].strip()
            try:
                # Parse the JSON answer instead of eval()-ing arbitrary model output.
                payload = response.strip().strip("`").strip()
                if payload.startswith("json"):
                    payload = payload[len("json"):]
                chunk_hallucinations = json.loads(payload)
                for hallucination in chunk_hallucinations:
                    hallucinations.append({
                        'chunk': chunk,
                        'hallucination': hallucination['hallucination'],
                        'explanation': hallucination['explanation']
                    })
            except Exception as e:
                print(f"Error parsing LLM response: {e}")
                print(f"LLM Response: {response}")
        return hallucinations
    def calculate_metrics(self, generated_text, reference_text):
        """Calculates BERTScore, BLEU, ROUGE, METEOR, and SacreBLEU metrics."""
        results = {}
        try:
            results['bertscore'] = self.bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
            # evaluate's BLEU expects raw strings and tokenizes internally.
            bleu_results = self.bleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
            results['bleu'] = bleu_results['bleu']
            rouge_results = self.rouge_metric.compute(predictions=[generated_text], references=[reference_text])
            results['rougeL'] = rouge_results['rougeL']
            meteor_results = self.meteor_metric.compute(predictions=[generated_text], references=[reference_text])
            results['meteor'] = meteor_results['meteor']
            # SacreBLEU (more robust BLEU implementation)
            sacrebleu_results = self.sacrebleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
            results['sacrebleu'] = sacrebleu_results['score']
        except Exception as e:
            print(f"Error calculating metrics: {e}")
            results = {'bertscore': None, 'bleu': None, 'rougeL': None, 'meteor': None, 'sacrebleu': None}
        return results
    def analyze_text(self, text, perplexity_threshold=40):
        """
        Analyzes a text for perplexity and potential hallucinations.
        """
        results = []
        sentences = nltk.sent_tokenize(text)
        for i, sentence in enumerate(sentences):
            perplexity = self.calculate_perplexity(sentence)
            hallucinations = self.detect_hallucination_with_llm(sentence)
            issues = []
            if perplexity > perplexity_threshold:
                issues.append(f"- **High Perplexity:** ({perplexity:.2f}) The sentence might be grammatically incorrect or nonsensical.")
            if hallucinations:
                for hallucination in hallucinations:
                    issues.append(f"- **Potential Hallucination (LLM):** {hallucination['hallucination']} - {hallucination['explanation']}")
            review_flag = len(issues) > 0
            explanation = "\n".join(issues) if issues else "No potential issues detected."
            results.append({
                'sentence_index': i,
                'review_flag': review_flag,
                'explanation': explanation,
                'perplexity': perplexity,
                'hallucinations': hallucinations,
                'sentence': sentence
            })
        return results
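    # NOTE: get_llm_judgment is referenced in analyze_content_for_review below but was not
    # defined in the original file. This is a minimal sketch: it asks the judge model to grade
    # the generated text against the reference as "no issues", "minor issues", or "major issues".
    # The prompt wording and label set are assumptions, not the original author's implementation.
    def get_llm_judgment(self, generated_text, reference_text):
        prompt = f"""
You are an expert reviewer of educational content.
Compare the generated text with the reference text and judge its factual and topical quality.
Answer with exactly one of: "no issues", "minor issues", "major issues".

Reference Text:
{reference_text}

Generated Text:
{generated_text}

Judgment:"""
        response = self.judge(prompt, max_new_tokens=10, return_full_text=False)[0]['generated_text'].lower()
        for judgment in ("major issues", "minor issues", "no issues"):
            if judgment in response:
                return judgment
        return "no issues"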
    def analyze_content_for_review(self, generated_text, reference_text,
                                   similarity_threshold,
                                   bertscore_threshold,
                                   bleu_threshold,
                                   rouge_threshold,
                                   meteor_threshold):
        """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
        similarity = self.estimate_semantic_similarity(generated_text, reference_text)
        metrics = self.calculate_metrics(generated_text, reference_text)
        llm_judgment = self.get_llm_judgment(generated_text, reference_text)
        issues = []
        if similarity < similarity_threshold:
            issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
        # Explicit None checks so a legitimate score of 0.0 is not silently skipped.
        if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
            issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
        if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
            issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
        if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
            issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
        if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
            issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")
        # Use LLM judgment as the primary decision-maker
        if llm_judgment == "major issues":
            review_flag = True
            explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
        elif llm_judgment == "minor issues":
            review_flag = True
            explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
        else:
            review_flag = False
            explanation = "LLM Judgment: **No Issues**"
        return {
            'review_flag': review_flag,
            'explanation': explanation,
            'semantic_similarity': similarity,
            'metrics': metrics,
            'llm_judgment': llm_judgment,
            'generated_text': generated_text,
            'reference_text': reference_text
        }
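    # NOTE: generate_educational_content is called in the example below but was not defined in
    # the original file. This is a hypothetical sketch that simply prompts the judge pipeline to
    # write a short lesson on the given topic; the real project presumably used a dedicated
    # content-generation model or API.
    def generate_educational_content(self, topic):
        prompt = f"Escreva um parágrafo didático, claro e factual sobre o tema: {topic}"
        return self.judge(prompt, max_new_tokens=300, return_full_text=False)[0]['generated_text'].strip()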
# --- Example Usage ---
if __name__ == "__main__":
    evaluator = ContentEvaluator()

    # Example text (replace with your actual data)
    text = """
    A Terra é plana e o Sol gira em torno dela.
    A gravidade é uma força fraca.
    As plantas precisam de água para sobreviver.
    A Lua é feita de queijo.
    Os dinossauros ainda vivem na Amazônia.
    """

    analysis_results = evaluator.analyze_text(text)
    for result in analysis_results:
        print(f"----- Sentence {result['sentence_index'] + 1} -----")
        print(f"Review Flag: {result['review_flag']}")
        print(f"Explanation: {result['explanation']}")
        print(f"Perplexity: {result['perplexity']:.2f}")
        print(f"Sentence: {result['sentence']}\n")
    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = evaluator.generate_educational_content("Matemática")
    new_reference_text = "Content from your educational material..."
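    # NOTE: best_thresholds was not defined in the original file; it presumably comes from an
    # earlier threshold-tuning phase that is not shown here. The values below are illustrative
    # placeholders only.
    best_thresholds = {
        'similarity_threshold': 0.80,
        'bertscore_threshold': 0.80,
        'bleu_threshold': 0.30,
        'rouge_threshold': 0.30,
        'meteor_threshold': 0.30,
    }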
    evaluation_result = evaluator.analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")