Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -361,14 +361,8 @@ import mauve
 from sacrebleu import corpus_bleu
 from rouge_score import rouge_scorer
 from bert_score import score
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
-import nltk
-from nltk.util import ngrams
-from nltk.tokenize import word_tokenize
-from nltk.translate.meteor_score import meteor_score
-from nltk.translate.chrf_score import sentence_chrf
-from textstat import flesch_reading_ease, flesch_kincaid_grade
-from sklearn.metrics.pairwise import cosine_similarity
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, AutoTokenizer
+import re
 from mauve import compute_mauve
 import os
 import gradio as gr
@@ -406,11 +400,9 @@ class RAGEvaluator:
     def __init__(self):
         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
-        #
-
-
-        nltk.download('omw-1.4', quiet=True)
-
+        # Initialize tokenizer for text processing
+        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
     def load_gpt2_model(self):
         model = GPT2LMHeadModel.from_pretrained('gpt2')
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
@@ -449,8 +441,17 @@ class RAGEvaluator:
         return ppl.item()

     def evaluate_diversity(self, texts):
-
-
+        # Use Hugging Face tokenizer instead of NLTK
+        all_tokens = []
+        for text in texts:
+            tokens = self.tokenizer.tokenize(text)
+            all_tokens.extend(tokens)
+
+        # Create bigrams manually
+        unique_bigrams = set()
+        for i in range(len(all_tokens) - 1):
+            unique_bigrams.add((all_tokens[i], all_tokens[i+1]))
+
         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
         return diversity_score

@@ -460,19 +461,79 @@ class RAGEvaluator:
         return bias_score

     def evaluate_meteor(self, candidates, references):
-
-
-
-
-
+        # Simple approximation of METEOR without NLTK
+        # This is a simplified version - consider using an external API for full METEOR
+        meteor_scores = []
+        for ref, cand in zip(references, candidates):
+            ref_tokens = self.tokenizer.tokenize(ref)
+            cand_tokens = self.tokenizer.tokenize(cand)
+
+            # Calculate precision and recall
+            common_tokens = set(ref_tokens) & set(cand_tokens)
+            precision = len(common_tokens) / len(cand_tokens) if cand_tokens else 0
+            recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
+
+            # F-measure with alpha=0.9 (METEOR default)
+            if precision + recall == 0:
+                f_score = 0
+            else:
+                f_score = (10 * precision * recall) / (9 * precision + recall)
+
+            meteor_scores.append(f_score)
+
+        return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

     def evaluate_chrf(self, candidates, references):
-
-
+        # Simple character n-gram F-score approximation
+        chrf_scores = []
+        for ref, cand in zip(references, candidates):
+            # Character 6-grams
+            ref_chars = list(ref)
+            cand_chars = list(cand)
+
+            ref_ngrams = set()
+            cand_ngrams = set()
+
+            # Create character 6-grams
+            for i in range(len(ref_chars) - 5):
+                ref_ngrams.add(tuple(ref_chars[i:i+6]))
+
+            for i in range(len(cand_chars) - 5):
+                cand_ngrams.add(tuple(cand_chars[i:i+6]))
+
+            common_ngrams = ref_ngrams & cand_ngrams
+            precision = len(common_ngrams) / len(cand_ngrams) if cand_ngrams else 0
+            recall = len(common_ngrams) / len(ref_ngrams) if ref_ngrams else 0
+
+            if precision + recall == 0:
+                chrf_score = 0
+            else:
+                chrf_score = 2 * precision * recall / (precision + recall)
+
+            chrf_scores.append(chrf_score)
+
+        return sum(chrf_scores) / len(chrf_scores) if chrf_scores else 0

     def evaluate_readability(self, text):
-
-
+        # Simple readability metrics without textstat
+        words = re.findall(r'\b\w+\b', text.lower())
+        sentences = re.split(r'[.!?]+', text)
+
+        num_words = len(words)
+        num_sentences = len([s for s in sentences if s.strip()])
+
+        # Average word length
+        avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0
+
+        # Words per sentence
+        words_per_sentence = num_words / num_sentences if num_sentences else 0
+
+        # Simplified Flesch Reading Ease approximation
+        flesch_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * avg_word_length)
+
+        # Simplified Flesch-Kincaid Grade Level approximation
+        flesch_grade = (0.39 * words_per_sentence) + (11.8 * avg_word_length) - 15.59
+
         return flesch_ease, flesch_grade

     def evaluate_mauve(self, reference_texts, generated_texts):
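
For context, a minimal sketch of how the reworked metrics might be called after this change. It assumes RAGEvaluator is importable from app.py as committed above; the import path and sample strings are illustrative assumptions, and constructing the evaluator downloads the GPT-2, dehatebert, and bert-base-uncased weights on first run.

# Minimal usage sketch; assumes app.py exposes RAGEvaluator as shown in this diff.
# The import path and the sample candidate/reference strings are illustrative assumptions.
from app import RAGEvaluator

evaluator = RAGEvaluator()  # loads GPT-2, the dehatebert bias pipeline, and the bert-base-uncased tokenizer

candidates = ["The quick brown fox jumps over the lazy dog."]
references = ["A quick brown fox jumped over a lazy dog."]

print(evaluator.evaluate_meteor(candidates, references))   # unigram-overlap METEOR approximation
print(evaluator.evaluate_chrf(candidates, references))     # character 6-gram F-score approximation
print(evaluator.evaluate_diversity(candidates))            # distinct-bigram ratio over tokenizer tokens
print(evaluator.evaluate_readability(candidates[0]))       # (flesch_ease, flesch_grade) approximations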