louiecerv committed on
Commit
19b0f3b
·
1 Parent(s): e22e88d

sync with remote

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +183 -0
  3. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ data/
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # data_uploader.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ from torchvision import transforms
8
+ from torch.utils.data import Dataset, DataLoader
9
+ from datasets import Dataset as HFDataset, DatasetDict
10
+ from huggingface_hub import HfApi # For Hugging Face Hub interaction
11
+ import os
12
+
13
+ # Hugging Face Hub credentials
14
+ HF_TOKEN = os.getenv("HF_TOKEN")
15
+ REPO_ID = "louiecerv/american_sign_language" # Replace with your dataset repo name
16
+
17
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+ st.write(f"Enabled GPU = {torch.cuda.is_available()}")
19
+
20
class MyDataset(Dataset):
    """In-memory dataset of feature/label tensors placed on the module-level ``device``.

    Args:
        x_df: Features. May be a NumPy array, a pandas DataFrame, a nested
            sequence, or an existing tensor; stored as ``float32``.
        y_df: Labels, same accepted kinds as ``x_df``; stored as ``long``.
    """

    @staticmethod
    def _as_tensor_input(data):
        """Return *data* unchanged if it is a tensor, otherwise as a NumPy array."""
        # torch.tensor() does not reliably accept pandas objects, so anything
        # that is not already a tensor is routed through NumPy first.
        return data if torch.is_tensor(data) else np.asarray(data)

    def __init__(self, x_df, y_df):
        features = self._as_tensor_input(x_df)
        labels = self._as_tensor_input(y_df)
        # Explicit dtypes: float32 features, int64 ("long") labels.
        self.xs = torch.tensor(features, dtype=torch.float32).to(device)
        self.ys = torch.tensor(labels, dtype=torch.long).to(device)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at ``idx``."""
        return self.xs[idx], self.ys[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.xs)
32
+
33
+ # Load the dataset and convert to Hugging Face Dataset
34
def load_and_convert_to_hf_dataset(x, y, split="train"):
    """Convert flattened images and labels into a Hugging Face dataset of tensors.

    Args:
        x: Iterable of flattened images; each row is reshaped to 28x28, so it
            must hold 784 values.
        y: Labels aligned with the rows of ``x``.
        split: Kept for call-site readability and backward compatibility;
            it is not used by this function.

    Returns:
        A Hugging Face ``Dataset`` with the ``image`` column removed, a
        ``pixel_values`` column added, and its format set to ``"torch"``.
    """
    df = pd.DataFrame({"image": list(x), "label": y})
    hf_dataset = HFDataset.from_pandas(df)

    # Built once and reused for every image and every batch; the pipeline does
    # not depend on the image, so rebuilding it per image was wasted work.
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ])

    def preprocess_function(examples):
        """Turn one batch of flat pixel rows into a stacked tensor of images."""
        tensors = [
            # Rows come back from Arrow as plain lists: reshape to 2-D, cast to
            # uint8 so PIL accepts the array, then run the transform pipeline.
            transform(Image.fromarray(np.array(img).reshape(28, 28).astype('uint8')))
            for img in examples["image"]
        ]
        examples["pixel_values"] = torch.stack(tensors)
        return examples

    hf_dataset = hf_dataset.map(preprocess_function, batched=True, remove_columns=["image"])
    hf_dataset.set_format("torch")
    return hf_dataset
59
+
60
+ # Load and convert dataframes to Hugging Face datasets
61
def _read_split(csv_path):
    """Read one Sign-MNIST style CSV into ``(frame, features, labels)``.

    Returns ``(None, None, None)`` when ``csv_path`` does not exist. The data
    folder is git-ignored, so a deployment without the files must still be able
    to import this module and reach ``main()`` instead of failing here with
    ``FileNotFoundError``.
    """
    if not os.path.exists(csv_path):
        return None, None, None
    frame = pd.read_csv(csv_path)
    # pop() removes the label column in place, leaving only pixel columns.
    labels = frame.pop('label').values
    return frame, frame.values, labels


train_df, x_train, y_train = _read_split("data/asl_data/sign_mnist_train.csv")
valid_df, x_valid, y_valid = _read_split("data/asl_data/sign_mnist_valid.csv")
68
+
69
def upload_dataset_to_hub(dataset, repo_id):
    """Create the dataset repository if needed and push ``dataset`` to it.

    Args:
        dataset: Object exposing ``push_to_hub`` (here a ``DatasetDict``).
        repo_id: Target repository in ``"namespace/name"`` form.
    """
    api = HfApi(token=HF_TOKEN)
    # exist_ok=True makes repeated runs safe when the repo already exists.
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
    # Push with the same token used to create the repo; previously the push
    # relied on whatever credentials happened to be cached on the machine.
    dataset.push_to_hub(repo_id, token=HF_TOKEN)
    print(f"Dataset uploaded to {repo_id}")
74
+
75
+
76
def main():
    """Render the uploader page and, when run locally, build and upload the dataset.

    On Hugging Face Spaces (detected through the ``SPACE_ID`` environment
    variable) only the title and instructions are shown and the script stops.
    Anywhere else the About text is displayed, both CSV splits are loaded and
    converted, and the resulting ``DatasetDict`` is pushed to ``REPO_ID``.
    """
    st.title("American Sign Language Dataset Uploader")

    # Streamlit renders this as Markdown when it is passed to st.write().
    about = """
    ## About This App

    This app is designed to load, preprocess, and upload datasets to the Hugging Face Hub. The main functionalities are encapsulated in the following components:

    ### Custom Dataset Class

    The `MyDataset` class inherits from `torch.utils.data.Dataset` and is used to handle the dataset.

    - **Initialization (`__init__`)**:
    - Converts input dataframes `x_df` and `y_df` to PyTorch tensors with explicit data types (`float32` for features and `long` for labels).
    - Moves the tensors to the specified device (e.g., GPU).

    - **Get Item (`__getitem__`)**:
    - Retrieves the feature (`x`) and label (`y`) tensors at a given index `idx`.

    - **Length (`__len__`)**:
    - Returns the length of the dataset.

    ### Load and Convert to Hugging Face Dataset

    The `load_and_convert_to_hf_dataset` function converts input data into a Hugging Face dataset.

    - **DataFrame Creation**:
    - Creates a Pandas DataFrame from the input features (`x`) and labels (`y`).

    - **Preprocessing Function**:
    - Reshapes images to 28x28 pixels.
    - Converts images to PIL format and applies transformations (grayscale, resize, tensor conversion, and normalization).
    - Stacks the transformed images into a tensor.

    - **Dataset Mapping**:
    - Applies the preprocessing function to the dataset.
    - Sets the dataset format to PyTorch.

    ### Data Loading and Conversion

    The app loads training and validation data from CSV files and converts them into Hugging Face datasets.

    - **Training Data**:
    - Loads data from `sign_mnist_train.csv`.
    - Separates features and labels.
    - Converts to a Hugging Face dataset.

    - **Validation Data**:
    - Loads data from `sign_mnist_valid.csv`.
    - Separates features and labels.
    - Converts to a Hugging Face dataset.

    ### Upload Dataset to Hugging Face Hub

    The `upload_dataset_to_hub` function uploads the dataset to the Hugging Face Hub.

    - **Repository Creation**:
    - Creates a repository if it doesn't exist.

    - **Dataset Upload**:
    - Pushes the dataset to the specified repository.

    ### Main Function

    The `main` function orchestrates the entire process.

    - Loads and preprocesses training and validation data.
    - Creates a `DatasetDict` containing both datasets.
    - Uploads the dataset to the Hugging Face Hub.

    ### Execution

    The script is executed by calling the `main` function if the script is run as the main module.

    ```python
    if __name__ == "__main__":
        main()
    ```
    """

    st.write("## Instructions")
    st.write("Do not run this code on Huggingface. Download the code and run it on your local machine.")
    st.write("Make sure you have the required files in the data/asl_data folder.")

    # Hugging Face Spaces export SPACE_ID. Stop there, but let a local run
    # continue; an unconditional stop left everything below unreachable.
    if os.getenv("SPACE_ID"):
        st.stop()

    with st.expander("About", expanded=True):
        st.write(about)

    train_csv = "data/asl_data/sign_mnist_train.csv"
    valid_csv = "data/asl_data/sign_mnist_valid.csv"

    # Report missing inputs on the page instead of failing with a traceback.
    missing = [path for path in (train_csv, valid_csv) if not os.path.exists(path)]
    if missing:
        st.error("Missing required data file(s): " + ", ".join(missing))
        st.stop()

    train_df = pd.read_csv(train_csv)
    y_train = train_df.pop('label').values
    x_train = train_df.values
    train_dataset = load_and_convert_to_hf_dataset(x_train, y_train, "train")

    valid_df = pd.read_csv(valid_csv)
    y_valid = valid_df.pop('label').values
    x_valid = valid_df.values
    valid_dataset = load_and_convert_to_hf_dataset(x_valid, y_valid, "validation")

    # Bundle both splits so they are uploaded as a single dataset repository.
    full_dataset = DatasetDict({
        "train": train_dataset,
        "validation": valid_dataset,
    })

    upload_dataset_to_hub(full_dataset, REPO_ID)

    st.write("Data upload complete.")
181
+
182
+ if __name__ == "__main__":
183
+ main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ pandas
4
+ matplotlib
5
+ datasets