taziksh committed · Commit 9c5ec50 · verified · Parent: 48e8586

Upload folder using huggingface_hub

Files changed (4):
  1. .gitignore +24 -0
  2. README.md +3 -9
  3. app.py +268 -0
  4. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,24 @@
+# Ignore the virtual environment
+derm/
+
+# Ignore model weights & large binary files
+model.h5
+scin_dataset_precomputed_embeddings.npz
+
+# Ignore system files
+__pycache__/
+*.pyc
+*.pyo
+.DS_Store
+
+# Ignore logs and temporary files
+logs/
+*.log
+*.out
+*.err
+.env
+
+# Ignore IDE/Editor files
+.vscode/
+.idea/
+*.swp
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Derm Foundation
-emoji: 🔥
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.14.0
+title: derm-foundation
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 4.44.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+import os
+import io
+import ast
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from tensorflow.keras import layers, regularizers
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.model_selection import train_test_split
+from google.cloud import storage
+from huggingface_hub import hf_hub_download, login
+from PIL import Image
+import gradio as gr
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Access and validate the Hugging Face token
+hf_token = os.getenv('HF_TOKEN')
+if hf_token:
+    login(token=hf_token)
+else:
+    # Fall back to the token stored in the default location
+    token_path = os.path.expanduser('~/.huggingface/token')
+    if os.path.exists(token_path):
+        with open(token_path) as f:
+            login(token=f.read().strip())
+    else:
+        print("Please set HF_TOKEN environment variable or store your token in ~/.huggingface/token")
+        exit(1)
+
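+# (Editorial note) The .env file is expected to contain a single line of the
+# form HF_TOKEN=hf_xxx, where hf_xxx is a placeholder for the user's token.
+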
+# ======================
+# CONSTANTS & CONFIGURATION
+# ======================
+
+SCIN_GCP_PROJECT = 'dx-scin-public'
+SCIN_GCS_BUCKET_NAME = 'dx-scin-public-data'
+SCIN_GCS_CASES_CSV = 'dataset/scin_cases.csv'
+SCIN_GCS_LABELS_CSV = 'dataset/scin_labels.csv'
+
+SCIN_HF_MODEL_NAME = 'google/derm-foundation'
+SCIN_HF_EMBEDDING_FILE = 'scin_dataset_precomputed_embeddings.npz'
+
+# The 10 conditions we want to predict
+CONDITIONS_TO_PREDICT = [
+    'Eczema',
+    'Allergic Contact Dermatitis',
+    'Insect Bite',
+    'Urticaria',
+    'Psoriasis',
+    'Folliculitis',
+    'Irritant Contact Dermatitis',
+    'Tinea',
+    'Herpes Zoster',
+    'Drug Rash'
+]
+
+# ======================
+# HELPER FUNCTIONS FOR DATA LOADING
+# ======================
+
+def initialize_df_with_metadata(bucket, csv_path):
+    csv_bytes = bucket.blob(csv_path).download_as_string()
+    df = pd.read_csv(io.BytesIO(csv_bytes), dtype={'case_id': str})
+    df['case_id'] = df['case_id'].astype(str)
+    return df
+
+def augment_metadata_with_labels(df, bucket, csv_path):
+    csv_bytes = bucket.blob(csv_path).download_as_string()
+    labels_df = pd.read_csv(io.BytesIO(csv_bytes), dtype={'case_id': str})
+    labels_df['case_id'] = labels_df['case_id'].astype(str)
+    merged_df = pd.merge(df, labels_df, on='case_id')
+    return merged_df
+
+def load_embeddings_from_file(repo_id, object_name):
+    file_path = hf_hub_download(repo_id=repo_id, filename=object_name, local_dir='./')
+    embeddings = {}
+    with open(file_path, 'rb') as f:
+        npz_file = np.load(f, allow_pickle=True)
+        for key, value in npz_file.items():
+            embeddings[key] = value
+    return embeddings
+
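+# (Editorial sketch, not used by this app) How an embedding for a *new* image
+# could be computed with the Derm Foundation SavedModel instead of the
+# precomputed .npz. The serving-signature input/output key names
+# ('inputs', 'image/encoded', 'embedding') follow the model card and are
+# assumptions here, not verified by this script:
+#
+#   from huggingface_hub import snapshot_download
+#   model_dir = snapshot_download(repo_id=SCIN_HF_MODEL_NAME)
+#   infer = tf.saved_model.load(model_dir).signatures['serving_default']
+#   # png_bytes: encoded image bytes (placeholder variable)
+#   example = tf.train.Example(features=tf.train.Features(feature={
+#       'image/encoded': tf.train.Feature(
+#           bytes_list=tf.train.BytesList(value=[png_bytes]))}))
+#   output = infer(inputs=tf.constant([example.SerializeToString()]))
+#   vector = output['embedding'].numpy().flatten()  # 6144-dim, like the .npz
+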
+# ======================
+# DATA PREPARATION FUNCTION
+# ======================
+
+def prepare_data(df, embeddings):
+    MINIMUM_CONFIDENCE = 0  # Adjust this if needed.
+    X = []
+    y = []
+    poor_image_quality_counter = 0
+    missing_embedding_counter = 0
+    not_in_condition_counter = 0
+    condition_confidence_low_counter = 0
+
+    for row in df.itertuples():
+        # Check if the image is marked as having sufficient quality.
+        if getattr(row, 'dermatologist_gradable_for_skin_condition_1', None) != 'DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT':
+            poor_image_quality_counter += 1
+            continue
+
+        # Parse the stringified label and confidence lists.
+        # ast.literal_eval is used instead of eval: it accepts only literals,
+        # so a malformed CSV field cannot execute arbitrary code.
+        try:
+            labels = ast.literal_eval(getattr(row, 'dermatologist_skin_condition_on_label_name', '[]'))
+            confidences = ast.literal_eval(getattr(row, 'dermatologist_skin_condition_confidence', '[]'))
+        except (ValueError, SyntaxError):
+            continue
+
+        row_labels = []
+        for label, conf in zip(labels, confidences):
+            if label not in CONDITIONS_TO_PREDICT:
+                not_in_condition_counter += 1
+                continue
+            if conf < MINIMUM_CONFIDENCE:
+                condition_confidence_low_counter += 1
+                continue
+            row_labels.append(label)
+
+        # For each image associated with this case, add its embedding and labels.
+        for image_path in [getattr(row, 'image_1_path', None),
+                           getattr(row, 'image_2_path', None),
+                           getattr(row, 'image_3_path', None)]:
+            if image_path is None or pd.isna(image_path):
+                continue
+            if image_path not in embeddings:
+                missing_embedding_counter += 1
+                continue
+            X.append(embeddings[image_path])
+            y.append(row_labels)
+
+    print(f'Poor image quality count: {poor_image_quality_counter}')
+    print(f'Missing embedding count: {missing_embedding_counter}')
+    print(f'Condition not in list count: {not_in_condition_counter}')
+    print(f'Excluded due to low confidence count: {condition_confidence_low_counter}')
+    return X, y
+
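+# (Editorial illustration) The stringified fields consumed above look roughly
+# like this; the values are made up, not taken from the dataset:
+#   dermatologist_skin_condition_on_label_name -> "['Eczema', 'Tinea']"
+#   dermatologist_skin_condition_confidence    -> "[4, 2]"
+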
+# ======================
+# MODEL BUILDING FUNCTION
+# ======================
+
+def build_model(input_dim, output_dim, weight_decay=1e-4):
+    inputs = tf.keras.Input(shape=(input_dim,))
+    hidden = layers.Dense(256, activation="relu",
+                          kernel_regularizer=regularizers.l2(weight_decay),
+                          bias_regularizer=regularizers.l2(weight_decay))(inputs)
+    hidden = layers.Dropout(0.1)(hidden)
+    hidden = layers.Dense(128, activation="relu",
+                          kernel_regularizer=regularizers.l2(weight_decay),
+                          bias_regularizer=regularizers.l2(weight_decay))(hidden)
+    hidden = layers.Dropout(0.1)(hidden)
+    output = layers.Dense(output_dim, activation="sigmoid")(hidden)
+    model = tf.keras.Model(inputs, output)
+    model.compile(loss="binary_crossentropy",
+                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
+    return model
+
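+# (Editorial note) With input_dim=6144 and output_dim=10 as used in main(),
+# the head is 6144 -> 256 -> 128 -> 10. Independent sigmoids plus
+# binary_crossentropy make this a multi-label classifier: each condition gets
+# its own probability rather than competing in a softmax.
+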
+# ======================
+# MAIN FUNCTION & GRADIO INTERFACE
+# ======================
+
+def main():
+    # Connect to the Google Cloud Storage bucket.
+    storage_client = storage.Client(SCIN_GCP_PROJECT)
+    bucket = storage_client.bucket(SCIN_GCS_BUCKET_NAME)
+
+    # Load SCIN dataset CSVs and merge them.
+    df_cases = initialize_df_with_metadata(bucket, SCIN_GCS_CASES_CSV)
+    df_full = augment_metadata_with_labels(df_cases, bucket, SCIN_GCS_LABELS_CSV)
+    df_full.set_index('case_id', inplace=True)
+
+    # Load precomputed embeddings from Hugging Face.
+    print("Loading embeddings...")
+    embeddings = load_embeddings_from_file(SCIN_HF_MODEL_NAME, SCIN_HF_EMBEDDING_FILE)
+
+    # Prepare the training data.
+    print("Preparing training data...")
+    X, y = prepare_data(df_full, embeddings)
+    X = np.array(X)
+    # Convert the list of label lists to binary indicator arrays.
+    mlb = MultiLabelBinarizer(classes=CONDITIONS_TO_PREDICT)
+    y_bin = mlb.fit_transform(y)
+
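+    # (Editorial illustration) fit_transform maps label lists to 0/1 rows
+    # aligned with CONDITIONS_TO_PREDICT; e.g. (labels made up)
+    # [['Eczema'], ['Psoriasis', 'Tinea']] becomes
+    # [[1,0,0,0,0,0,0,0,0,0], [0,0,0,0,1,0,0,1,0,0]].
+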
+    # Split the data into train and test sets.
+    X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.2, random_state=42)
+
+    # If a saved model exists, load it; otherwise, build, train, and save one.
+    # (Derm Foundation embeddings are 6144-dimensional, hence input_dim=6144.)
+    model_file = "model.h5"
+    if os.path.exists(model_file):
+        print("Loading existing model from", model_file)
+        model = tf.keras.models.load_model(model_file)
+    else:
+        model = build_model(input_dim=6144, output_dim=len(CONDITIONS_TO_PREDICT))
+        print("Training model... This may take a few minutes.")
+        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
+        test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)
+        model.fit(train_ds, validation_data=test_ds, epochs=15)
+        model.save(model_file)
+
+    # Extract the list of case IDs for the dropdown.
+    case_ids = list(df_full.index)
+
+    def predict_case(case_id: str):
+        """Fetch images and predictions for a given case ID."""
+        if case_id not in df_full.index:
+            return [], "Case ID not found!", "N/A", "N/A"
+
+        row = df_full.loc[case_id]
+        image_paths = [row.get('image_1_path'), row.get('image_2_path'), row.get('image_3_path')]
+        images, predictions_text = [], []
+
+        # Get the dermatologist's labels (stored as stringified lists).
+        dermatologist_conditions = row.get('dermatologist_skin_condition_on_label_name', "N/A")
+        dermatologist_confidence = row.get('dermatologist_skin_condition_confidence', "N/A")
+
+        if isinstance(dermatologist_conditions, str):
+            try:
+                dermatologist_conditions = ast.literal_eval(dermatologist_conditions)
+                dermatologist_confidence = ast.literal_eval(dermatologist_confidence)
+            except (ValueError, SyntaxError):
+                pass  # Leave the raw strings in place if parsing fails.
+
+        # Process images & generate predictions.
+        for path in image_paths:
+            if isinstance(path, str) and (path in embeddings):
+                try:
+                    img_bytes = bucket.blob(path).download_as_string()
+                    img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+                    images.append(img)
+                except Exception:
+                    continue
+
+                # Model prediction from the precomputed embedding.
+                emb = np.expand_dims(embeddings[path], axis=0)
+                pred = model.predict(emb)[0]
+                pred_dict = {cond: round(float(prob), 3) for cond, prob in zip(mlb.classes_, pred)}
+                predictions_text.append(str(pred_dict))
+
+        # Format the output.
+        predictions_text = "\n".join(predictions_text) if predictions_text else "No predictions available."
+        dermatologist_conditions = str(dermatologist_conditions)
+        dermatologist_confidence = str(dermatologist_confidence)
+
+        return images, predictions_text, dermatologist_conditions, dermatologist_confidence
+
+    # Create the Gradio interface with a dropdown of case IDs.
+    iface = gr.Interface(
+        fn=predict_case,
+        inputs=gr.Dropdown(choices=case_ids, label="Select a Case ID"),
+        outputs=[
+            gr.Gallery(label="Case Images"),
+            gr.Textbox(label="Model's Predictions"),
+            gr.Textbox(label="Dermatologist's Skin Conditions"),
+            gr.Textbox(label="Dermatologist's Confidence Ratings")
+        ],
+        title="Derm Foundation Skin Conditions Explorer",
+        description="Select a Case ID from the dropdown to view images and predictions."
+    )
+
+    iface.launch(share=True)
+
+
+if __name__ == "__main__":
+    main()
requirements.txt ADDED
@@ -0,0 +1,9 @@
+gradio
+tensorflow
+numpy
+pandas
+scikit-learn
+google-cloud-storage
+huggingface_hub
+pillow
+python-dotenv