sync with remote
- .gitignore +1 -0
- app.py +183 -0
- requirements.txt +5 -0
.gitignore
ADDED
data/
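Since data/ is ignored, the Sign Language MNIST CSVs that app.py reads stay on the local machine. Before launching the app locally, a quick pre-flight check can confirm the expected layout; check_data.py below is a hypothetical helper, not part of this commit.

# check_data.py -- hypothetical helper, not part of this commit.
# Confirms the CSV files app.py expects exist under data/asl_data/.
from pathlib import Path

for name in ("sign_mnist_train.csv", "sign_mnist_valid.csv"):
    path = Path("data/asl_data") / name
    status = "found" if path.exists() else "MISSING"
    print(f"{path}: {status}")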
app.py
ADDED
# data_uploader.py
import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset, DatasetDict
from huggingface_hub import HfApi  # For Hugging Face Hub interaction
import os

# Hugging Face Hub credentials
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = "louiecerv/american_sign_language"  # Replace with your dataset repo name

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
st.write(f"Enabled GPU = {torch.cuda.is_available()}")

# Tensor-backed dataset; not used by the upload flow below.
class MyDataset(Dataset):
    def __init__(self, x_df, y_df):
        self.xs = torch.tensor(x_df, dtype=torch.float32).to(device)  # Explicitly set dtype
        self.ys = torch.tensor(y_df, dtype=torch.long).to(device)  # Explicitly set dtype

    def __getitem__(self, idx):
        x = self.xs[idx]
        y = self.ys[idx]
        return x, y

    def __len__(self):
        return len(self.xs)

# Load the dataset and convert it to a Hugging Face Dataset
def load_and_convert_to_hf_dataset(x, y, split="train"):
    df = pd.DataFrame({"image": list(x), "label": y})  # Create a DataFrame
    hf_dataset = HFDataset.from_pandas(df)

    # Build the transform once rather than once per image
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])

    # Preprocess images (important for Hugging Face)
    def preprocess_function(examples):
        # Reshape each flat 784-pixel row back into a 28x28 image
        images = [np.array(img).reshape(28, 28) for img in examples["image"]]
        # Convert to PIL images and apply the transformations
        transformed_images = []
        for image in images:
            image = Image.fromarray(image.astype('uint8'))
            transformed_images.append(transform(image))
        examples["pixel_values"] = torch.stack(transformed_images)  # Stack the images into a tensor
        return examples

    hf_dataset = hf_dataset.map(preprocess_function, batched=True, remove_columns=["image"])
    hf_dataset.set_format("torch")  # Set format to PyTorch
    return hf_dataset

# The CSVs are read inside main() so that the data/ folder (excluded from the
# repo by .gitignore) is only needed when the upload actually runs.

def upload_dataset_to_hub(dataset, repo_id):
    api = HfApi(token=HF_TOKEN)
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)  # Create repo if it doesn't exist
    dataset.push_to_hub(repo_id, token=HF_TOKEN)  # Pass the token explicitly
    print(f"Dataset uploaded to {repo_id}")


def main():
    st.title("American Sign Language Dataset Uploader")

    about = """
## About This App

This app is designed to load, preprocess, and upload datasets to the Hugging Face Hub. The main functionalities are encapsulated in the following components:

### Custom Dataset Class

The `MyDataset` class inherits from `torch.utils.data.Dataset` and is used to handle the dataset.

- **Initialization (`__init__`)**:
  - Converts input dataframes `x_df` and `y_df` to PyTorch tensors with explicit data types (`float32` for features and `long` for labels).
  - Moves the tensors to the specified device (e.g., GPU).

- **Get Item (`__getitem__`)**:
  - Retrieves the feature (`x`) and label (`y`) tensors at a given index `idx`.

- **Length (`__len__`)**:
  - Returns the length of the dataset.

### Load and Convert to Hugging Face Dataset

The `load_and_convert_to_hf_dataset` function converts input data into a Hugging Face dataset.

- **DataFrame Creation**:
  - Creates a Pandas DataFrame from the input features (`x`) and labels (`y`).

- **Preprocessing Function**:
  - Reshapes images to 28x28 pixels.
  - Converts images to PIL format and applies transformations (grayscale, resize, tensor conversion, and normalization).
  - Stacks the transformed images into a tensor.

- **Dataset Mapping**:
  - Applies the preprocessing function to the dataset.
  - Sets the dataset format to PyTorch.

### Data Loading and Conversion

The app loads training and validation data from CSV files and converts them into Hugging Face datasets.

- **Training Data**:
  - Loads data from `sign_mnist_train.csv`.
  - Separates features and labels.
  - Converts to a Hugging Face dataset.

- **Validation Data**:
  - Loads data from `sign_mnist_valid.csv`.
  - Separates features and labels.
  - Converts to a Hugging Face dataset.

### Upload Dataset to Hugging Face Hub

The `upload_dataset_to_hub` function uploads the dataset to the Hugging Face Hub.

- **Repository Creation**:
  - Creates the repository if it doesn't exist.

- **Dataset Upload**:
  - Pushes the dataset to the specified repository.

### Main Function

The `main` function orchestrates the entire process.

- Loads and preprocesses training and validation data.
- Creates a `DatasetDict` containing both datasets.
- Uploads the dataset to the Hugging Face Hub.

### Execution

The script calls the `main` function when run as the main module.

```python
if __name__ == "__main__":
    main()
```
"""

    st.write("## Instructions")
    st.write("Do not run this app on Hugging Face. Download the code and run it on your local machine.")
    st.write("Make sure you have the required files in the data/asl_data folder.")

    with st.expander("About", expanded=True):
        st.write(about)

    # Only run the upload locally: data/ is excluded by .gitignore, so the CSV
    # files are not available on the Space. SPACE_ID is set by the Spaces runtime.
    if os.getenv("SPACE_ID"):
        st.stop()

    train_df = pd.read_csv("data/asl_data/sign_mnist_train.csv")
    y_train = train_df.pop('label').values
    x_train = train_df.values
    train_dataset = load_and_convert_to_hf_dataset(x_train, y_train, "train")

    valid_df = pd.read_csv("data/asl_data/sign_mnist_valid.csv")
    y_valid = valid_df.pop('label').values
    x_valid = valid_df.values
    valid_dataset = load_and_convert_to_hf_dataset(x_valid, y_valid, "validation")

    # Create a DatasetDict
    full_dataset = DatasetDict({
        "train": train_dataset,
        "validation": valid_dataset
    })

    upload_dataset_to_hub(full_dataset, REPO_ID)  # Upload the DatasetDict

    st.write("Data upload complete.")

if __name__ == "__main__":
    main()
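Once pushed, the dataset can be pulled back and batched for training. The sketch below is a minimal consumer, assuming the upload above succeeded; the pixel_values and label columns come from preprocess_function in app.py.

# consume_dataset.py -- a minimal sketch, assuming the upload above succeeded.
from datasets import load_dataset
from torch.utils.data import DataLoader

ds = load_dataset("louiecerv/american_sign_language")  # DatasetDict: train/validation
ds.set_format("torch", columns=["pixel_values", "label"])

train_loader = DataLoader(ds["train"], batch_size=32, shuffle=True)
batch = next(iter(train_loader))
print(batch["pixel_values"].shape)  # expected: torch.Size([32, 1, 28, 28])
print(batch["label"].shape)         # expected: torch.Size([32])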
requirements.txt
ADDED
torch
torchvision
pandas
matplotlib
datasets
streamlit
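To confirm the push landed, the Hub can be queried directly; verify_upload.py below is a minimal sketch using huggingface_hub's dataset_info with the REPO_ID from app.py.

# verify_upload.py -- a minimal sketch: confirm the dataset repo exists on the Hub.
from huggingface_hub import HfApi

api = HfApi()
info = api.dataset_info("louiecerv/american_sign_language")
print(info.id, info.lastModified)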