import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from peft import LoraConfig, get_peft_model
import gradio as gr

# Pick the device once; training is far more practical on a GPU, but the
# script still runs (slowly) on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Preprocess reviews: pad the "TL;DR" marker with spaces so it tokenizes
# consistently as the task separator between review and summary.
reviews_path = "data_reviews.txt"
with open(reviews_path, "r") as reviews_raw:
    reviews = reviews_raw.readlines()
reviews = [review.replace("TL;DR", " TL;DR ") for review in reviews]
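
# Sanity check, assuming each line of data_reviews.txt holds a review
# followed by "TL;DR" and its reference summary.
assert all(" TL;DR " in review for review in reviews), "every line needs a TL;DR separator"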
max_length = 200
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
extra_length = len(tokenizer.encode(" TL;DR "))
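# extra_length is the token count of the separator itself (in practice 4
# tokens under GPT-2's byte-level BPE); pad_truncate below uses it so every
# training example comes out exactly max_length + extra_length tokens long.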
# Wrap GPT-2 in a LoRA adapter so only the injected low-rank matrices are
# trained while the base weights stay frozen.
config = LoraConfig(
    r=512,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model = model.to(device)
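
# Optional: report how many parameters the adapter actually trains relative
# to the frozen base model.
model.print_trainable_parameters()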
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text with tokenizer.encode() and append EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos)

            # Pad/truncate the encoded sequence to a fixed length
            padded = self.pad_truncate(tokenized)

            # Create a tensor and add it to the result
            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)

    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        # Every encoded sequence must come out at the same total length
        # (max_len + extra_length) so the DataLoader can stack the tensors.
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            # Truncate, leaving room for one closing EOS token so the total
            # stays at max_len + extra_length.
            result = name[: self.max_len + extra_length - 1] + [self.eos_id]
        else:
            result = name
        return result
dataset = ReviewDataset(tokenizer, reviews, max_length)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
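
# Quick sanity check: decode one padded example to confirm the separator
# and EOS padding survived preprocessing.
print(tokenizer.decode(dataset[0]))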
epochs = 2
model.train()
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        batch = batch.to(device)

        # Use the inputs as labels; the model shifts them internally for
        # next-token prediction and returns the language-modeling loss.
        output = model(batch, labels=batch)
        loss = output.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
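
# Optional: save only the trained LoRA adapter weights (a few MB rather
# than the full GPT-2 checkpoint); the directory name is just an example.
model.save_pretrained("gpt2-tldr-lora")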
def topk(probs, n=9):
    # Softmax the raw scores to convert them into probabilities
    probs = torch.softmax(probs, dim=-1)

    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)

    # Renormalize the new selection pool (n choices) so it sums to 1
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Move to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Sample one token from the pool using the renormalized distribution
    choice = np.random.choice(n, 1, p=tokensProb)
    tokenId = topIx[choice][0]
    return int(tokenId)
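
# Example: draws one token id from the 9 most likely entries of a random
# logit vector over the GPT-2 vocabulary.
# topk(torch.randn(len(tokenizer)))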
def model_infer(model, tokenizer, review, max_length=15):
    # Encode the prompt (the review text ending in the task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.no_grad():
        # Feed the prompt to the model
        output = model(initial_input)

        # Take the logits at the final time step
        logits = output.logits[0, -1]

        # Make a top-k choice and append it to the result
        result.append(topk(logits))

        # Generate up to max_length further tokens
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            current_input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(current_input)
            logits = output.logits[0, -1]
            res_id = topk(logits)

            # If the chosen token is EOS, stop and return the decoded result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            result.append(res_id)

    # If no EOS was generated, return after max_length tokens
    return tokenizer.decode(result)
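
# Put the model in eval mode so LoRA dropout is disabled during generation.
model.eval()

# Example call (the review text is illustrative only):
# model_infer(model, tokenizer, "Great taste and arrived fresh. TL;DR ")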
def summarize(review):
    # Append the task designator, then keep only the generated part after it
    summary = model_infer(model, tokenizer, review + " TL;DR ")
    return summary.split(" TL;DR ")[1].strip()

iface = gr.Interface(fn=summarize, inputs="text", outputs="text")
iface.launch()