from fastapi import FastAPI
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import evaluate
import numpy as np

app = FastAPI()


@app.get("/")
def summarize():
    # Example: loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")
    billsum = billsum.train_test_split(test_size=0.2)
    # import pandas as pd
    # df = pd.read_csv("squad_sample_train.tsv", sep="\t")
    # print(df.head())  # Debugging step
    # return {"Hello": "World!", "dataset_length": len(billsum)}
    # return df.head()

    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    prefix = "summarize: "

    def preprocess_function(examples):
        # Prepend the T5 task prefix, then tokenize inputs and targets.
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    # Pass the model instance (not the checkpoint string) so the collator can
    # prepare decoder_input_ids from the labels.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    rouge = evaluate.load("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace the -100 padding sentinel before decoding the labels.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)
        return {k: round(v, 4) for k, v in result.items()}

    """training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        logging_dir="./logs",  # Save logs here
        eval_strategy="steps",
        learning_rate=2e-5,
        per_device_train_batch_size=16,  # Increase batch size
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=1,  # Reduce epochs
        predict_with_generate=True,
        fp16=True,  # Keep mixed precision
        push_to_hub=False,
        # optim="adamw_bnb_8bit",  # Use 8-bit optimizer
        logging_steps=10,  # Log every 10 steps
        logging_strategy="steps",
        dataloader_num_workers=4,  # Speed up data loading
        save_strategy="epoch",  # Reduce checkpointing overhead
        save_steps=500,
        gradient_accumulation_steps=4,  # Effective larger batch size
    )"""

    training_args = Seq2SeqTrainingArguments(
        output_dir="./tmp_test",  # Temporary output directory
        max_steps=2,  # Run only 2 steps
        per_device_train_batch_size=1,  # Smallest batch size
        per_device_eval_batch_size=1,  # Smallest batch size
        eval_strategy="no",  # No evaluation for speed
        save_strategy="no",  # No checkpoint saving
        logging_strategy="no",  # No logging
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # TrainOutput is a NamedTuple, so FastAPI serializes it as a JSON array.
    return trainer.train()
    # return data_collator
    # return type(tokenized_billsum)


"""Earlier iteration, kept for reference:

from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

# @app.get("/")
# Load dataset and tokenizer
billsum = load_dataset("billsum", split="ca_test")  # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prefix = "summarize: "  # Example prefix for text generation

@app.get("/")
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# @app.get("/")
def get_tokenized_data():
    tokenized_billsum = billsum.map(preprocess_function, batched=True)
    # Convert to a list of dictionaries
    json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")
    return {"tokenized_data": json_serializable_output}  # Ensure JSON format
"""