sync with remote
- .gitignore +1 -0
- app.py +183 -0
- requirements.txt +5 -0
.gitignore
ADDED
data/
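Since data/ is ignored, the Sign Language MNIST CSVs that app.py reads stay on the local machine. Before launching the app locally, a quick pre-flight check can confirm the expected layout; check_data.py below is a hypothetical helper, not part of this commit.

# check_data.py -- hypothetical helper, not part of this commit.
# Confirms the CSV files app.py expects exist under data/asl_data/.
from pathlib import Path

for name in ("sign_mnist_train.csv", "sign_mnist_valid.csv"):
    path = Path("data/asl_data") / name
    status = "found" if path.exists() else "MISSING"
    print(f"{path}: {status}")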
app.py
ADDED
# data_uploader.py
import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from huggingface_hub import HfApi  # For Hugging Face Hub interaction
import os

# Hugging Face Hub credentials
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = "louiecerv/american_sign_language"  # Replace with your dataset repo name

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Enabled GPU = {torch.cuda.is_available()}")

# Tensor-backed dataset; not used by the upload flow below.
class MyDataset(Dataset):
    def __init__(self, x_df, y_df):
        self.xs = torch.tensor(x_df, dtype=torch.float32).to(device)  # Explicitly set dtype
        self.ys = torch.tensor(y_df, dtype=torch.long).to(device)  # Explicitly set dtype

    def __getitem__(self, idx):
        x = self.xs[idx]
        y = self.ys[idx]
        return x, y

    def __len__(self):
        return len(self.xs)

# Load the dataset and convert it to a Hugging Face Dataset
def load_and_convert_to_hf_dataset(x, y, split="train"):
    df = pd.DataFrame({"image": list(x), "label": y})  # Create a DataFrame
    hf_dataset = HFDataset.from_pandas(df)

    # Build the transform once rather than once per image
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])

    # Preprocess images (important for Hugging Face)
    def preprocess_function(examples):
        # Reshape each flat 784-pixel row back into a 28x28 image
        images = [np.array(img).reshape(28, 28) for img in examples["image"]]
        # Convert to PIL images and apply the transformations
        transformed_images = []
        for image in images:
            image = Image.fromarray(image.astype('uint8'))
            transformed_images.append(transform(image))
        examples["pixel_values"] = torch.stack(transformed_images)  # Stack the images into a tensor
        return examples

    hf_dataset = hf_dataset.map(preprocess_function, batched=True, remove_columns=["image"])
    hf_dataset.set_format("torch")  # Set format to PyTorch
    return hf_dataset

# The CSVs are read inside main() so that the data/ folder (excluded from the
# repo by .gitignore) is only needed when the upload actually runs.

def upload_dataset_to_hub(dataset, repo_id):
    api = HfApi(token=HF_TOKEN)
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)  # Create repo if it doesn't exist
    dataset.push_to_hub(repo_id, token=HF_TOKEN)  # Pass the token explicitly
    print(f"Dataset uploaded to {repo_id}")


def main():
    st.title("American Sign Language Dataset Uploader")

    about = """
## About This App

This app is designed to load, preprocess, and upload datasets to the Hugging Face Hub. The main functionalities are encapsulated in the following components:

### Custom Dataset Class

The `MyDataset` class inherits from `torch.utils.data.Dataset` and is used to handle the dataset.

- **Initialization (`__init__`)**:
  - Converts input dataframes `x_df` and `y_df` to PyTorch tensors with explicit data types (`float32` for features and `long` for labels).
  - Moves the tensors to the specified device (e.g., GPU).

- **Get Item (`__getitem__`)**:
  - Retrieves the feature (`x`) and label (`y`) tensors at a given index `idx`.

- **Length (`__len__`)**:
  - Returns the length of the dataset.

### Load and Convert to Hugging Face Dataset

The `load_and_convert_to_hf_dataset` function converts input data into a Hugging Face dataset.

- **DataFrame Creation**:
  - Creates a Pandas DataFrame from the input features (`x`) and labels (`y`).

- **Preprocessing Function**:
  - Reshapes images to 28x28 pixels.
  - Converts images to PIL format and applies transformations (grayscale, resize, tensor conversion, and normalization).
  - Stacks the transformed images into a tensor.

- **Dataset Mapping**:
  - Applies the preprocessing function to the dataset.
  - Sets the dataset format to PyTorch.

### Data Loading and Conversion

The app loads training and validation data from CSV files and converts them into Hugging Face datasets.

- **Training Data**:
  - Loads data from `sign_mnist_train.csv`.
  - Separates features and labels.
  - Converts to a Hugging Face dataset.

- **Validation Data**:
  - Loads data from `sign_mnist_valid.csv`.
  - Separates features and labels.
  - Converts to a Hugging Face dataset.

### Upload Dataset to Hugging Face Hub

The `upload_dataset_to_hub` function uploads the dataset to the Hugging Face Hub.

- **Repository Creation**:
  - Creates the repository if it doesn't exist.

- **Dataset Upload**:
  - Pushes the dataset to the specified repository.

### Main Function

The `main` function orchestrates the entire process.

- Loads and preprocesses training and validation data.
- Creates a `DatasetDict` containing both datasets.
- Uploads the dataset to the Hugging Face Hub.

### Execution

The script calls the `main` function when run as the main module.

```python
if __name__ == "__main__":
    main()
```
"""

    st.write("## Instructions")
    st.write("Do not run this app on Hugging Face. Download the code and run it on your local machine.")
    st.write("Make sure you have the required files in the data/asl_data folder.")

    with st.expander("About", expanded=True):
        st.write(about)

    # Only run the upload locally: data/ is excluded by .gitignore, so the CSV
    # files are not available on the Space. SPACE_ID is set by the Spaces runtime.
    if os.getenv("SPACE_ID"):
        st.stop()

    train_df = pd.read_csv("data/asl_data/sign_mnist_train.csv")
    y_train = train_df.pop('label').values
    x_train = train_df.values
    train_dataset = load_and_convert_to_hf_dataset(x_train, y_train, "train")

    valid_df = pd.read_csv("data/asl_data/sign_mnist_valid.csv")
    y_valid = valid_df.pop('label').values
    x_valid = valid_df.values
    valid_dataset = load_and_convert_to_hf_dataset(x_valid, y_valid, "validation")

    # Create a DatasetDict
    full_dataset = DatasetDict({
        "train": train_dataset,
        "validation": valid_dataset
    })

    upload_dataset_to_hub(full_dataset, REPO_ID)  # Upload the DatasetDict

    st.write("Data upload complete.")

if __name__ == "__main__":
    main()
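Once pushed, the dataset can be pulled back and batched for training. The sketch below is a minimal consumer, assuming the upload above succeeded; the pixel_values and label columns come from preprocess_function in app.py.

# consume_dataset.py -- a minimal sketch, assuming the upload above succeeded.
from datasets import load_dataset
from torch.utils.data import DataLoader

ds = load_dataset("louiecerv/american_sign_language")  # DatasetDict: train/validation
ds.set_format("torch", columns=["pixel_values", "label"])

train_loader = DataLoader(ds["train"], batch_size=32, shuffle=True)
batch = next(iter(train_loader))
print(batch["pixel_values"].shape)  # expected: torch.Size([32, 1, 28, 28])
print(batch["label"].shape)         # expected: torch.Size([32])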
requirements.txt
ADDED
torch
torchvision
pandas
matplotlib
datasets
streamlit
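To confirm the push landed, the Hub can be queried directly; verify_upload.py below is a minimal sketch using huggingface_hub's dataset_info with the REPO_ID from app.py.

# verify_upload.py -- a minimal sketch: confirm the dataset repo exists on the Hub.
from huggingface_hub import HfApi

api = HfApi()
info = api.dataset_info("louiecerv/american_sign_language")
print(info.id, info.lastModified)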