from fastapi import FastAPI

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

app = FastAPI()

@app.get("/")
def summarize():
    # Example: loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")
    # "ca_test" is a single split; carve out train/test subsets so the
    # Trainer below can index tokenized_billsum["train"] and ["test"].
    billsum = billsum.train_test_split(test_size=0.2)
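    # billsum records expose "text" (the full bill) and "summary" (the target),
    # the two columns preprocess_function below relies on.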

    # import pandas as pd
    # df = pd.read_csv("squad_sample_train.tsv", sep="\t")
    # print(df.head())  # Debugging step
    # return {"Hello": "World!", "dataset_length": len(billsum)}
    # return df.head()

    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    prefix = "summarize: "

    def preprocess_function(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
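
    # Illustrative only: for a batch such as {"text": ["Some bill text ..."],
    # "summary": ["A short summary ..."]}, preprocess_function returns a dict
    # holding "input_ids", "attention_mask", and "labels" (the tokenized summaries).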

    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

    rouge = evaluate.load("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace -100 (the ignored-label index) with the pad token id so labels decode cleanly
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()}
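
    # Illustrative only: with the "rouge" metric from evaluate, the returned
    # dict typically holds "rouge1", "rouge2", "rougeL", "rougeLsum", and "gen_len".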

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    training_args = Seq2SeqTrainingArguments(
        output_dir="my_awesome_billsum_model",
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,  # Increased batch size
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=1,  # Reduced epochs for a faster run
        predict_with_generate=True,
        fp16=True,  # Keep mixed precision (requires a CUDA GPU)
        push_to_hub=False,
        # optim="adamw_bnb_8bit",  # Optional 8-bit optimizer (needs bitsandbytes)
        logging_steps=100,  # Reduce logging overhead
        dataloader_num_workers=4,  # Speed up data loading
        save_strategy="epoch",  # Reduce checkpointing overhead
        gradient_accumulation_steps=4,  # Effective batch size: 16 * 4 = 64 per device
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        processing_class=tokenizer,  # newer transformers name for the old `tokenizer=` argument
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    train_result = trainer.train()

    # A DataCollatorForSeq2Seq is not JSON-serializable, so return a
    # JSON-friendly training summary instead of the collator.
    return {"training_loss": train_result.training_loss, "global_step": train_result.global_step}
    # return type(tokenized_billsum)
"""from fastapi import FastAPI | |
from datasets import load_dataset | |
from transformers import AutoTokenizer | |
app = FastAPI() | |
#@app.get("/") | |
# Load dataset and tokenizer | |
billsum = load_dataset("billsum", split="ca_test") # Load a small sample | |
tokenizer = AutoTokenizer.from_pretrained("t5-small") | |
prefix = "summarize: " # Example prefix for text generation | |
@app.get("/") | |
def preprocess_function(examples): | |
inputs = [prefix + doc for doc in examples["text"]] | |
model_inputs = tokenizer(inputs, max_length=1024, truncation=True) | |
labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) | |
model_inputs["labels"] = labels["input_ids"] | |
return model_inputs | |
#@app.get("/") | |
def get_tokenized_data(): | |
tokenized_billsum = billsum.map(preprocess_function, batched=True) | |
# Convert to list of dictionaries | |
json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records") | |
return {"tokenized_data": json_serializable_output} # Ensure JSON format""" | |