maahi2412 committed on
Commit
58013a0
·
verified ·
1 Parent(s): f928108

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -10
app.py CHANGED
@@ -37,19 +37,15 @@ def load_or_finetune_pegasus():
37
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
38
  model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
39
 
40
- cnn_dm = load_dataset("cnn_dailymail", "3.0.0", split="train[:5000]")
41
- xsum = load_dataset("xsum", split="train[:5000]", trust_remote_code=True)
 
42
  combined_dataset = concatenate_datasets([cnn_dm, xsum])
43
 
44
  def preprocess_function(examples):
45
- # Ensure texts and summaries are lists of strings
46
- texts = examples["article"] if "article" in examples else examples["document"]
47
- summaries = examples["highlights"] if "highlights" in examples else examples["summary"]
48
-
49
- # Tokenize inputs and targets
50
- inputs = tokenizer(texts, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
51
- targets = tokenizer(summaries, max_length=400, truncation=True, padding="max_length", return_tensors="pt")
52
-
53
  inputs["labels"] = targets["input_ids"]
54
  return inputs
55
 
 
37
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
38
  model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
39
 
40
+ # Load and normalize datasets
41
+ cnn_dm = load_dataset("cnn_dailymail", "3.0.0", split="train[:5000]").rename_column("article", "text").rename_column("highlights", "summary")
42
+ xsum = load_dataset("xsum", split="train[:5000]", trust_remote_code=True).rename_column("document", "text")
43
  combined_dataset = concatenate_datasets([cnn_dm, xsum])
44
 
45
  def preprocess_function(examples):
46
+ # Directly use normalized 'text' and 'summary' fields
47
+ inputs = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
48
+ targets = tokenizer(examples["summary"], max_length=400, truncation=True, padding="max_length", return_tensors="pt")
 
 
 
 
 
49
  inputs["labels"] = targets["input_ids"]
50
  return inputs
51