Prathamesh1420 committed
Commit ddfa055 · verified · 1 Parent(s): cd479e2

Update app.py

Files changed (1)
  1. app.py +85 -24
app.py CHANGED
@@ -361,14 +361,8 @@ import mauve
 from sacrebleu import corpus_bleu
 from rouge_score import rouge_scorer
 from bert_score import score
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
-import nltk
-from nltk.util import ngrams
-from nltk.tokenize import word_tokenize
-from nltk.translate.meteor_score import meteor_score
-from nltk.translate.chrf_score import sentence_chrf
-from textstat import flesch_reading_ease, flesch_kincaid_grade
-from sklearn.metrics.pairwise import cosine_similarity
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, AutoTokenizer
+import re
 from mauve import compute_mauve
 import os
 import gradio as gr
@@ -406,11 +400,9 @@ class RAGEvaluator:
     def __init__(self):
         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
-        # Download required NLTK data
-        nltk.download('punkt', quiet=True)
-        nltk.download('wordnet', quiet=True)
-        nltk.download('omw-1.4', quiet=True)
-
+        # Initialize tokenizer for text processing
+        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
     def load_gpt2_model(self):
         model = GPT2LMHeadModel.from_pretrained('gpt2')
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
@@ -449,8 +441,17 @@ class RAGEvaluator:
         return ppl.item()
 
     def evaluate_diversity(self, texts):
-        all_tokens = [tok for text in texts for tok in text.split()]
-        unique_bigrams = set(ngrams(all_tokens, 2))
+        # Use Hugging Face tokenizer instead of NLTK
+        all_tokens = []
+        for text in texts:
+            tokens = self.tokenizer.tokenize(text)
+            all_tokens.extend(tokens)
+
+        # Create bigrams manually
+        unique_bigrams = set()
+        for i in range(len(all_tokens) - 1):
+            unique_bigrams.add((all_tokens[i], all_tokens[i+1]))
+
         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
         return diversity_score
 
@@ -460,19 +461,79 @@ class RAGEvaluator:
         return bias_score
 
     def evaluate_meteor(self, candidates, references):
-        meteor_scores = [
-            meteor_score([word_tokenize(ref)], word_tokenize(cand))
-            for ref, cand in zip(references, candidates)
-        ]
-        return sum(meteor_scores) / len(meteor_scores)
+        # Simple approximation of METEOR without NLTK
+        # This is a simplified version - consider using an external API for full METEOR
+        meteor_scores = []
+        for ref, cand in zip(references, candidates):
+            ref_tokens = self.tokenizer.tokenize(ref)
+            cand_tokens = self.tokenizer.tokenize(cand)
+
+            # Calculate precision and recall
+            common_tokens = set(ref_tokens) & set(cand_tokens)
+            precision = len(common_tokens) / len(cand_tokens) if cand_tokens else 0
+            recall = len(common_tokens) / len(ref_tokens) if ref_tokens else 0
+
+            # F-measure with alpha=0.9 (METEOR default)
+            if precision + recall == 0:
+                f_score = 0
+            else:
+                f_score = (10 * precision * recall) / (9 * precision + recall)
+
+            meteor_scores.append(f_score)
+
+        return sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
 
     def evaluate_chrf(self, candidates, references):
-        chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
-        return sum(chrf_scores) / len(chrf_scores)
+        # Simple character n-gram F-score approximation
+        chrf_scores = []
+        for ref, cand in zip(references, candidates):
+            # Character 6-grams
+            ref_chars = list(ref)
+            cand_chars = list(cand)
+
+            ref_ngrams = set()
+            cand_ngrams = set()
+
+            # Create character 6-grams
+            for i in range(len(ref_chars) - 5):
+                ref_ngrams.add(tuple(ref_chars[i:i+6]))
+
+            for i in range(len(cand_chars) - 5):
+                cand_ngrams.add(tuple(cand_chars[i:i+6]))
+
+            common_ngrams = ref_ngrams & cand_ngrams
+            precision = len(common_ngrams) / len(cand_ngrams) if cand_ngrams else 0
+            recall = len(common_ngrams) / len(ref_ngrams) if ref_ngrams else 0
+
+            if precision + recall == 0:
+                chrf_score = 0
+            else:
+                chrf_score = 2 * precision * recall / (precision + recall)
+
+            chrf_scores.append(chrf_score)
+
+        return sum(chrf_scores) / len(chrf_scores) if chrf_scores else 0
 
     def evaluate_readability(self, text):
-        flesch_ease = flesch_reading_ease(text)
-        flesch_grade = flesch_kincaid_grade(text)
+        # Simple readability metrics without textstat
+        words = re.findall(r'\b\w+\b', text.lower())
+        sentences = re.split(r'[.!?]+', text)
+
+        num_words = len(words)
+        num_sentences = len([s for s in sentences if s.strip()])
+
+        # Average word length
+        avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0
+
+        # Words per sentence
+        words_per_sentence = num_words / num_sentences if num_sentences else 0
+
+        # Simplified Flesch Reading Ease approximation
+        flesch_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * avg_word_length)
+
+        # Simplified Flesch-Kincaid Grade Level approximation
+        flesch_grade = (0.39 * words_per_sentence) + (11.8 * avg_word_length) - 15.59
+
         return flesch_ease, flesch_grade
 
     def evaluate_mauve(self, reference_texts, generated_texts):
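As a rough usage sketch of the reworked metrics (illustrative only: it assumes app.py is importable as a module, and the sample strings are invented for the example):

# Illustrative sketch - RAGEvaluator is the class defined in app.py above.
from app import RAGEvaluator

evaluator = RAGEvaluator()
candidates = ["The Eiffel Tower is in Paris."]          # invented generated answer
references = ["The Eiffel Tower is located in Paris."]  # invented reference answer

print(evaluator.evaluate_diversity(candidates))           # distinct-bigram ratio over BERT subword tokens
print(evaluator.evaluate_meteor(candidates, references))  # unigram-overlap F-mean (METEOR-style approximation)
print(evaluator.evaluate_chrf(candidates, references))    # character 6-gram F-score approximation
print(evaluator.evaluate_readability(candidates[0]))      # (flesch_ease, flesch_grade) approximations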
 
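Since sacrebleu is already imported for corpus_bleu, its built-in chrF metric is one possible replacement for the hand-rolled character n-gram loop; a minimal sketch, assuming the corpus_chrf helper exposed by recent sacrebleu 2.x releases (the function name evaluate_chrf_sacrebleu is made up for illustration):

import sacrebleu

def evaluate_chrf_sacrebleu(candidates, references):
    # sacrebleu expects a list of hypotheses and a list of reference streams
    result = sacrebleu.corpus_chrf(candidates, [references])
    return result.score  # chrF on a 0-100 scale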
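For comparison, the dropped textstat functions implement the standard Flesch formulas, which use syllables per word where the commit substitutes average word length; a rough sketch with a naive vowel-group syllable counter (the helper names here are illustrative, not part of the commit):

import re

def count_syllables(word):
    # Naive heuristic: count runs of consecutive vowels as syllables, minimum one per word.
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))

def flesch_scores(text):
    words = re.findall(r'\b\w+\b', text.lower())
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    if not words or not sentences:
        return 0.0, 0.0
    words_per_sentence = len(words) / len(sentences)
    syllables_per_word = sum(count_syllables(w) for w in words) / len(words)
    # Standard Flesch Reading Ease and Flesch-Kincaid Grade Level formulas
    ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    return ease, grade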