Update app.py

app.py CHANGED

@@ -10,9 +10,12 @@ from huggingface_hub import login
 api_token = os.getenv('HF_TOKEN')
 
 # Load pre-trained model and tokenizer
-model_name = "gpt2"
+model_name = "gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_name)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+device = torch.device("mps") if torch.has_mps else torch.device("cpu")
+model.to(device)
 model.eval()
 
 def create_ngrams(tokens, n):
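Note on the device check added above: torch.has_mps works on recent PyTorch builds, but newer releases recommend torch.backends.mps.is_available() instead. A more portable selection, as a sketch (assuming PyTorch 1.12+ for the MPS backend), could be:

import torch

# Prefer Apple-silicon MPS, then CUDA, then CPU; falls back gracefully on other machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
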
@@ -46,10 +49,12 @@ def kneser_ney_smoothing(ngram_counts, lower_order_counts, discount=0.75):
     return probabilities
 
 def generate_text_with_probs(initial_context, top_p, max_length, top_k):
-    input_ids = tokenizer.encode(initial_context, return_tensors="pt")
+    input_ids = tokenizer.encode(initial_context, return_tensors="pt").to(device)
     generated_text = initial_context
     token_tables = []
 
+    token_no = 1
+
     with torch.no_grad():
         for _ in range(max_length):
             outputs = model(input_ids=input_ids)
@@ -62,42 +67,35 @@ def generate_text_with_probs(initial_context, top_p, max_length, top_k):
             sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
             sorted_indices_to_remove[..., 0] = 0
 
-            # Convert boolean mask to indices to set logits to -inf
             indices_to_remove = sorted_indices[sorted_indices_to_remove]
             next_token_logits[:, indices_to_remove] = -float('Inf')
-
-            # Compute probabilities
             probabilities = torch.softmax(next_token_logits, dim=-1)
 
-            # Get the next token using multinomial sampling
             next_token = torch.multinomial(probabilities, num_samples=1)
-
-            # Get next token and its probability
             next_token_prob = probabilities[0, next_token].item()
             next_token_text = tokenizer.decode(next_token.item())
 
-
-            top_tokens = sorted_indices[0, :top_k]  # Get top k tokens
+            top_tokens = sorted_indices[0, :top_k]
             top_probs = probabilities[0, top_tokens]
             top_token_probs = [(tokenizer.decode([token.item()]), prob.item()) for token, prob in zip(top_tokens, top_probs)]
 
-            # Create DataFrame for current token's top-k probabilities
             df = pd.DataFrame(top_token_probs, columns=["Token", "Probability"])
-            df.index = df.index + 1
-            token_tables.append((f"Next token: {next_token_text} (Probability: {next_token_prob:.8f})", df))
+            df.index = df.index + 1
+            token_tables.append((f"{token_no}>> Next token: {next_token_text} (Probability: {next_token_prob:.8f})", df))
+            token_no+=1
 
-            # Add the next token to the input_ids
            input_ids = torch.cat([input_ids, next_token], dim=-1)
 
             if next_token.item() == tokenizer.eos_token_id:
                 break
 
-    # Decode the generated text
     generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
 
-    return generated_text, token_tables
+    return generated_text[len(initial_context):], token_tables
 
 def predict_next_token_ngram(input_text, context_text, max_length):
+
+    ip = input_text
     context_tokens = tokenizer.tokenize(context_text)
     four_grams = create_ngrams(context_tokens, 4)
     four_gram_counts = Counter(four_grams)
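For readers following the sampling loop above, here is a minimal, self-contained sketch of the same top-p (nucleus) filtering step, with a stand-in logits tensor (the function name and the vocab size are illustrative, not part of the app):

import torch

def top_p_filter(logits, top_p=0.9):
    # logits: [1, vocab_size] scores for the next token of a single sequence
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

    # Mark tokens whose cumulative probability exceeds top_p, always keeping the best token
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    filtered = logits.clone()
    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    filtered[:, indices_to_remove] = -float("Inf")
    return filtered

# Usage: sample one token id from the filtered distribution
logits = torch.randn(1, 50257)  # stand-in for next_token_logits
probabilities = torch.softmax(top_p_filter(logits, top_p=0.9), dim=-1)
next_token = torch.multinomial(probabilities, num_samples=1)
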
@@ -106,14 +104,17 @@ def predict_next_token_ngram(input_text, context_text, max_length):
     probs = kneser_ney_smoothing(four_gram_counts, three_gram_counts)
 
     input_tokens = tokenizer.tokenize(input_text)
+    generated_tokens = input_tokens.copy()
     generated_text = input_text
     token_tables = []
 
-    if len(input_tokens) >= max_length:
+    if len(input_tokens) >= (max_length + len(generated_tokens)):
         generated_text = tokenizer.convert_tokens_to_string(input_tokens)
         return generated_text, token_tables
 
-    while len(input_tokens) < max_length:
+    token_no = 1
+
+    while len(input_tokens) < (max_length + len(generated_tokens)):
         input_3_gram = tuple(input_tokens[-3:])
         next_token_probs = probs.get(input_3_gram, {})
         if not next_token_probs:
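For reference, generated_tokens is copied from input_tokens before the loop and never grows, so the bound max_length + len(generated_tokens) allows at most max_length newly generated tokens: with, say, 5 prompt tokens and max_length=50, the loop stops once input_tokens reaches 55.
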
@@ -121,17 +122,17 @@ def predict_next_token_ngram(input_text, context_text, max_length):
         next_token = max(next_token_probs, key=next_token_probs.get)
         input_tokens.append(next_token)
 
-        # Get top tokens and their probabilities
         top_k = 4
         top_k_tokens = sorted(next_token_probs.items(), key=lambda x: x[1], reverse=True)[:top_k]
         top_k_tokens_df = pd.DataFrame(top_k_tokens, columns=["Token", "Probability"])
         top_k_tokens_df.index = top_k_tokens_df.index + 1  # Add numbering to the DataFrame
         top_k_tokens_df["Token"] = top_k_tokens_df["Token"].apply(lambda x: tokenizer.convert_tokens_to_string([x]))
 
-        token_tables.append((f"Next token: {next_token}", top_k_tokens_df))
+        token_tables.append((f"{token_no}>> Next token: {next_token}", top_k_tokens_df))
+        token_no+=1
 
     generated_text = tokenizer.convert_tokens_to_string(input_tokens)
-    return generated_text, token_tables
+    return generated_text[len(ip):], token_tables
 
 def combined_model_predictions(context_text, initial_context, top_p, max_length, top_k):
     generated_text, token_tables = generate_text_with_probs(initial_context, top_p, max_length, top_k)
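The n-gram branch assumes kneser_ney_smoothing returns a mapping from each 3-gram prefix to a {token: probability} dict, which probs.get(input_3_gram, {}) then queries. A small illustrative sketch of that data shape, using plain relative frequencies instead of the actual Kneser-Ney estimates and word tokens instead of GPT-2 BPE tokens:

from collections import Counter

def create_ngrams(tokens, n):
    # Sliding window of n consecutive tokens (a plausible version of the helper used above)
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = ["the", "cat", "sat", "on", "the", "cat", "sat", "again"]
four_gram_counts = Counter(create_ngrams(tokens, 4))
three_gram_counts = Counter(create_ngrams(tokens, 3))

# probs maps each 3-gram prefix to {next_token: probability}; relative frequencies
# stand in here for the smoothed estimates computed by the app.
probs = {}
for (a, b, c, d), count in four_gram_counts.items():
    probs.setdefault((a, b, c), {})[d] = count / three_gram_counts[(a, b, c)]

input_3_gram = ("the", "cat", "sat")
next_token_probs = probs.get(input_3_gram, {})
if next_token_probs:
    next_token = max(next_token_probs, key=next_token_probs.get)
    print(next_token, next_token_probs[next_token])  # -> on 0.5
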
@@ -146,17 +147,15 @@ iface = gr.Interface(
         gr.Textbox(lines=2, placeholder="Enter initial context here..."),
         gr.Slider(0, 1, step=0.01, value=0.9, label="Top-p (nucleus) sampling"),
         gr.Slider(1, 100, step=1, value=50, label="Max length"),
-        gr.Slider(1, 50, step=1, value=10, label="Top-k"),
+        gr.Slider(1, 50, step=1, value=10, label="Top-k"),
     ],
     outputs=[
         gr.Textbox(label="Generated Text"),
-        gr.Dataframe(label="LLM Token Probabilities"),
+        gr.Dataframe(label="LLM Token Probabilities"),
         gr.Textbox(label="N-gram Generated Text"),
-        gr.Dataframe(label="N-gram Token Predictions"),
+        gr.Dataframe(label="N-gram Token Predictions"),
     ],
-    title="Next Token Visualizer (GPT-2 -
-    description="Generate text using GPT-2 with top-p (nucleus) sampling and see the probabilities of generated tokens in tables, along with N-gram model predictions.",
+    title="Next Token Visualizer (GPT-2-large - 812M param.)"
 )
 
-# Launch the Gradio app
 iface.launch()