# BLIP-2 image captioning: LoRA fine-tuning on Flickr8k (Colab notebook)

# Install libraries
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets
# Fix fsspec version mismatch (if needed)
!pip install fsspec==2025.3.0

# Upload kaggle.json for the Kaggle API
from google.colab import files
files.upload()  # Upload kaggle.json here

# Move kaggle.json to the location the Kaggle CLI expects
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d adityajn105/flickr8k --force

# Unzip it
!unzip -q flickr8k.zip -d flickr8k
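# Quick check (an illustrative addition, not in the original notebook): confirm
# the unzipped layout matches the paths used below -- a captions.txt file plus
# an Images/ folder of ~8k JPEGs.
!ls /content/flickr8k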
import os

import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader

# Dataset paths
DATASET_PATH = '/content/flickr8k'
CAPTIONS_FILE = os.path.join(DATASET_PATH, 'captions.txt')
IMAGES_PATH = os.path.join(DATASET_PATH, 'Images/')

# Load the captions file; it ships with an "image,caption" header row
df = pd.read_csv(CAPTIONS_FILE)
df = df.dropna().reset_index(drop=True)

# Keep the first 8,000 caption rows for training
df = df[:8000]
from transformers import AutoProcessor

# BLIP-2 processor: image preprocessing + OPT tokenizer
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
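# Optional sanity check (an illustrative addition): run the processor on one
# training image and confirm the tensor shape BLIP-2 expects -- typically
# pixel_values of shape (1, 3, 224, 224) for this checkpoint.
_img = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
print(processor(images=_img, return_tensors="pt")["pixel_values"].shape)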
class Flickr8kDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        caption = row["caption"]

        # Load and preprocess the image
        image = Image.open(image_path).convert('RGB')
        encoding = self.processor(images=image, return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        # Keep the raw caption text; it is tokenized in collate_fn
        encoding["text"] = caption
        return encoding
def collate_fn(batch):
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            # Stack image tensors (pixel_values) across the batch
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            # Tokenize the captions with padding so they can be batched
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch
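# Optional sanity check (an illustrative addition): collate a tiny hand-built
# batch and confirm it contains stacked pixel_values plus padded input_ids and
# attention_mask, which is exactly what the training loop below consumes.
_ds = Flickr8kDataset(df, IMAGES_PATH, processor)
print({k: tuple(v.shape) for k, v in collate_fn([_ds[0], _ds[1]]).items()})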
from transformers import Blip2ForConditionalGeneration
from peft import LoraConfig, get_peft_model

# Load BLIP-2 (OPT-2.7B) in 8-bit to fit in Colab GPU memory
model = Blip2ForConditionalGeneration.from_pretrained(
    "ybelkada/blip2-opt-2.7b-fp16-sharded",
    device_map="auto",
    load_in_8bit=True
)

# Apply LoRA to the attention query/key projections
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
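# Illustrative aside (an addition, not from the original notebook): q_proj and
# k_proj are the attention query/key projections of the OPT language model. To
# see which other projection modules could serve as LoRA targets, list the
# distinct module-name suffixes ending in "proj".
print({name.split(".")[-1] for name, _ in model.named_modules() if name.endswith("proj")})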
# Build dataset and dataloader
train_dataset = Flickr8kDataset(df, IMAGES_PATH, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_fn)

# Set up optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.train()
# Training loop
for epoch in range(1):  # Use a small number of epochs for testing, increase later
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)

        # Causal LM objective: the caption tokens serve as their own labels
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss
        print(f"Batch {idx} Loss: {loss.item():.4f}")

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
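# Optional (an addition, not part of the original run): save the LoRA adapter
# so the fine-tuned weights can be reloaded later without retraining. The
# directory name "blip2-flickr8k-lora" is an arbitrary choice for this sketch.
model.save_pretrained("blip2-flickr8k-lora")
processor.save_pretrained("blip2-flickr8k-lora")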
# Example prediction on a training image
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[0]["image"])).convert('RGB')
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

import matplotlib.pyplot as plt

# Show the sample image with the generated caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
# Try another sample image
sample_image = Image.open(os.path.join(IMAGES_PATH, df.iloc[15]["image"])).convert('RGB')

# Prepare inputs and generate a caption
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

# Show image with caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
# Caption an uploaded image
from google.colab import files  # Only for Colab

uploaded = files.upload()

# Use the (last) uploaded file
for filename in uploaded.keys():
    image_path = filename

# Load the image
sample_image = Image.open(image_path).convert('RGB')

# Prepare inputs and generate a caption
inputs = processor(images=sample_image, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated caption:", caption)

# Show image with caption
plt.figure(figsize=(6, 6))
plt.imshow(sample_image)
plt.axis("off")
plt.title(f"Generated caption:\n{caption}", fontsize=12)
plt.show()
# Evaluation: BLEU and CIDEr on a held-out subset
!pip install evaluate pycocoevalcap --quiet

import evaluate
from tqdm import tqdm
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

# Load the BLEU metric
bleu = evaluate.load("bleu")
# Reload the full captions file so every image keeps all 5 reference captions
df = pd.read_csv(CAPTIONS_FILE)

# Held-out subset: the caption rows immediately after the 8,000 used for training
subset_df = df[8000:8091].reset_index(drop=True)
# Collect predictions and references
predictions = []
references = {}

for idx in tqdm(range(len(subset_df))):
    row = subset_df.iloc[idx]
    image_name = row['image']
    image_path = os.path.join(IMAGES_PATH, image_name)
    image = Image.open(image_path).convert('RGB')

    # Generate a caption for the image
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    predictions.append(caption)

    # Ground-truth references: all captions for this image
    if image_name not in references:
        refs = df[df['image'] == image_name]['caption'].tolist()
        references[image_name] = refs

# Prepare for BLEU: one prediction vs. a list of references per row
gt_list = [references[name] for name in subset_df["image"]]
pred_list = predictions

bleu_score = bleu.compute(predictions=pred_list, references=gt_list)
print("BLEU:", bleu_score)
# Prepare COCO-style input for CIDEr
gts = {}
res = {}
for i, img in enumerate(subset_df["image"]):
    gts[str(i)] = [{"caption": cap} for cap in references[img]]
    res[str(i)] = [{"caption": predictions[i]}]

# Tokenize references and predictions
tokenizer = PTBTokenizer()
gts_tokenized = tokenizer.tokenize(gts)
res_tokenized = tokenizer.tokenize(res)

# Compute CIDEr
cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts_tokenized, res_tokenized)
print("CIDEr:", cider_score)