Upload folder using huggingface_hub

Files changed:
- .gitignore +24 -0
- README.md +3 -9
- app.py +268 -0
- requirements.txt +8 -0
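
For context, a commit like this is typically produced with the Hub client's upload_folder API rather than a manual git push; a minimal sketch (the repo id and local path are assumptions for illustration, not taken from this commit):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from HF_TOKEN or the local HF cache
    api.upload_folder(
        folder_path=".",                    # local directory containing the Space files
        repo_id="<user>/derm-foundation",   # hypothetical Space id
        repo_type="space",
        commit_message="Upload folder using huggingface_hub",
    )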
.gitignore
ADDED
@@ -0,0 +1,24 @@
# Ignore the virtual environment
derm/

# Ignore model weights & large binary files
model.h5
scin_dataset_precomputed_embeddings.npz

# Ignore system files
__pycache__/
*.pyc
*.pyo
.DS_Store

# Ignore logs and temporary files
logs/
*.log
*.out
*.err
.env

# Ignore IDE/Editor files
.vscode/
.idea/
*.swp
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-emoji: 🔥
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.14.0
+title: derm-foundation
 app_file: app.py
+sdk: gradio
+sdk_version: 4.44.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,268 @@
#!/usr/bin/env python
import ast
import io
import os

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv  # provided by the python-dotenv package (not listed in requirements.txt)
from google.cloud import storage
from huggingface_hub import hf_hub_download, login
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import layers, regularizers

# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Authenticate with the Hugging Face Hub: prefer the HF_TOKEN environment
# variable, then fall back to the token saved in the default CLI location.
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(token=hf_token)
else:
    token_path = os.path.expanduser('~/.huggingface/token')
    if os.path.exists(token_path):
        with open(token_path) as f:
            login(token=f.read().strip())
    else:
        print("Please set the HF_TOKEN environment variable or store your token in ~/.huggingface/token")
        raise SystemExit(1)

# ======================
# CONSTANTS & CONFIGURATION
# ======================

SCIN_GCP_PROJECT = 'dx-scin-public'
SCIN_GCS_BUCKET_NAME = 'dx-scin-public-data'
SCIN_GCS_CASES_CSV = 'dataset/scin_cases.csv'
SCIN_GCS_LABELS_CSV = 'dataset/scin_labels.csv'

SCIN_HF_MODEL_NAME = 'google/derm-foundation'
SCIN_HF_EMBEDDING_FILE = 'scin_dataset_precomputed_embeddings.npz'

# The 10 conditions we want to predict.
CONDITIONS_TO_PREDICT = [
    'Eczema',
    'Allergic Contact Dermatitis',
    'Insect Bite',
    'Urticaria',
    'Psoriasis',
    'Folliculitis',
    'Irritant Contact Dermatitis',
    'Tinea',
    'Herpes Zoster',
    'Drug Rash'
]

# ======================
# HELPER FUNCTIONS FOR DATA LOADING
# ======================

def initialize_df_with_metadata(bucket, csv_path):
    """Download a CSV from the GCS bucket into a DataFrame, keeping case_id as a string."""
    csv_bytes = bucket.blob(csv_path).download_as_string()
    df = pd.read_csv(io.BytesIO(csv_bytes), dtype={'case_id': str})
    df['case_id'] = df['case_id'].astype(str)
    return df

def augment_metadata_with_labels(df, bucket, csv_path):
    """Merge the case metadata with the dermatologist labels CSV on case_id."""
    csv_bytes = bucket.blob(csv_path).download_as_string()
    labels_df = pd.read_csv(io.BytesIO(csv_bytes), dtype={'case_id': str})
    labels_df['case_id'] = labels_df['case_id'].astype(str)
    merged_df = pd.merge(df, labels_df, on='case_id')
    return merged_df

def load_embeddings_from_file(repo_id, object_name):
    """Download the precomputed-embeddings .npz file from the Hub and load it into a dict."""
    file_path = hf_hub_download(repo_id=repo_id, filename=object_name, local_dir='./')
    embeddings = {}
    with open(file_path, 'rb') as f:
        npz_file = np.load(f, allow_pickle=True)
        for key, value in npz_file.items():
            embeddings[key] = value
    return embeddings

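# Illustrative note (added for clarity, not in the original commit): each key in
# the returned dict is an image path as it appears in the SCIN CSV columns
# (image_1_path etc.), and each value is that image's precomputed Derm Foundation
# embedding vector; given the input_dim=6144 used when building the model below:
#   emb = embeddings[some_image_path]   # assumed shape: (6144,)
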
# ======================
# DATA PREPARATION FUNCTION
# ======================

def prepare_data(df, embeddings):
    """Build (embedding, label-list) training pairs from the merged SCIN dataframe."""
    MINIMUM_CONFIDENCE = 0  # Adjust this if needed.
    X = []
    y = []
    poor_image_quality_counter = 0
    missing_embedding_counter = 0
    not_in_condition_counter = 0
    condition_confidence_low_counter = 0

    for row in df.itertuples():
        # Skip cases whose image was not graded as having sufficient quality.
        if getattr(row, 'dermatologist_gradable_for_skin_condition_1', None) != 'DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT':
            poor_image_quality_counter += 1
            continue

        # Parse the label and confidence lists, which are stored as Python
        # literals in the CSV; ast.literal_eval is a safe replacement for eval.
        try:
            labels = ast.literal_eval(getattr(row, 'dermatologist_skin_condition_on_label_name', '[]'))
            confidences = ast.literal_eval(getattr(row, 'dermatologist_skin_condition_confidence', '[]'))
        except (ValueError, SyntaxError):
            continue

        row_labels = []
        for label, conf in zip(labels, confidences):
            if label not in CONDITIONS_TO_PREDICT:
                not_in_condition_counter += 1
                continue
            if conf < MINIMUM_CONFIDENCE:
                condition_confidence_low_counter += 1
                continue
            row_labels.append(label)

        # For each image associated with this case, add its embedding and labels.
        for image_path in [getattr(row, 'image_1_path', None),
                           getattr(row, 'image_2_path', None),
                           getattr(row, 'image_3_path', None)]:
            if image_path is None or pd.isna(image_path):
                continue
            if image_path not in embeddings:
                missing_embedding_counter += 1
                continue
            X.append(embeddings[image_path])
            y.append(row_labels)

    print(f'Poor image quality count: {poor_image_quality_counter}')
    print(f'Missing embedding count: {missing_embedding_counter}')
    print(f'Condition not in list count: {not_in_condition_counter}')
    print(f'Excluded due to low confidence count: {condition_confidence_low_counter}')
    return X, y

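# Illustrative shapes (added for clarity, not in the original commit): after
# prepare_data, X is a list of per-image embedding vectors and y is a parallel
# list of label lists, e.g. y[i] == ['Eczema', 'Psoriasis'] for an image whose
# case carries two of the tracked conditions.
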
# ======================
# MODEL BUILDING FUNCTION
# ======================

def build_model(input_dim, output_dim, weight_decay=1e-4):
    """Build a small multi-label classification head on top of the embeddings."""
    inputs = tf.keras.Input(shape=(input_dim,))
    hidden = layers.Dense(256, activation="relu",
                          kernel_regularizer=regularizers.l2(weight_decay),
                          bias_regularizer=regularizers.l2(weight_decay))(inputs)
    hidden = layers.Dropout(0.1)(hidden)
    hidden = layers.Dense(128, activation="relu",
                          kernel_regularizer=regularizers.l2(weight_decay),
                          bias_regularizer=regularizers.l2(weight_decay))(hidden)
    hidden = layers.Dropout(0.1)(hidden)
    output = layers.Dense(output_dim, activation="sigmoid")(hidden)
    model = tf.keras.Model(inputs, output)
    model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
    return model

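# Design note (added for clarity, not in the original commit): the sigmoid
# output layer with binary cross-entropy treats the task as output_dim
# independent binary decisions, so one image can score high on several
# conditions at once; a softmax head would instead force a single-label
# distribution over the 10 conditions.
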
# ======================
# MAIN FUNCTION & GRADIO INTERFACE
# ======================

def main():
    # Connect to the Google Cloud Storage bucket.
    storage_client = storage.Client(SCIN_GCP_PROJECT)
    bucket = storage_client.bucket(SCIN_GCS_BUCKET_NAME)

    # Load the SCIN dataset CSVs and merge them.
    df_cases = initialize_df_with_metadata(bucket, SCIN_GCS_CASES_CSV)
    df_full = augment_metadata_with_labels(df_cases, bucket, SCIN_GCS_LABELS_CSV)
    df_full.set_index('case_id', inplace=True)

    # Load precomputed embeddings from Hugging Face.
    print("Loading embeddings...")
    embeddings = load_embeddings_from_file(SCIN_HF_MODEL_NAME, SCIN_HF_EMBEDDING_FILE)

    # Prepare the training data.
    print("Preparing training data...")
    X, y = prepare_data(df_full, embeddings)
    X = np.array(X)
    # Convert the list of label lists to binary indicator arrays.
    mlb = MultiLabelBinarizer(classes=CONDITIONS_TO_PREDICT)
    y_bin = mlb.fit_transform(y)

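    # Worked example (added for clarity, not in the original commit): with the
    # column order fixed by CONDITIONS_TO_PREDICT,
    #   mlb.fit_transform([['Eczema'], ['Psoriasis', 'Tinea']])
    # yields
    #   [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #    [0, 0, 0, 0, 1, 0, 0, 1, 0, 0]]
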
    # Split the data into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.2, random_state=42)

    # Build the model.
    model = build_model(input_dim=6144, output_dim=len(CONDITIONS_TO_PREDICT))

    # If a saved model exists, load it; otherwise, train and save it.
    model_file = "model.h5"
    if os.path.exists(model_file):
        print("Loading existing model from", model_file)
        model = tf.keras.models.load_model(model_file)
    else:
        print("Training model... This may take a few minutes.")
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
        test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)
        model.fit(train_ds, validation_data=test_ds, epochs=15)
        model.save(model_file)

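    # Note (added for clarity, not in the original commit): model.h5 is listed
    # in .gitignore, so on a fresh clone or Space build the file is absent and
    # the training branch above runs once at startup before the UI launches.
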
205 |
+
case_ids = list(df_full.index)
|
206 |
+
|
207 |
+
def predict_case(case_id: str):
|
208 |
+
"""Fetch images and predictions for a given case ID."""
|
209 |
+
if case_id not in df_full.index:
|
210 |
+
return [], "Case ID not found!", "N/A", "N/A"
|
211 |
+
|
212 |
+
row = df_full.loc[case_id]
|
213 |
+
image_paths = [row.get('image_1_path'), row.get('image_2_path'), row.get('image_3_path')]
|
214 |
+
images, predictions_text = [], []
|
215 |
+
|
216 |
+
# Get Dermatologist's Labels
|
217 |
+
dermatologist_conditions = row.get('dermatologist_skin_condition_on_label_name', "N/A")
|
218 |
+
dermatologist_confidence = row.get('dermatologist_skin_condition_confidence', "N/A")
|
219 |
+
|
220 |
+
if isinstance(dermatologist_conditions, str):
|
221 |
+
try:
|
222 |
+
dermatologist_conditions = eval(dermatologist_conditions)
|
223 |
+
dermatologist_confidence = eval(dermatologist_confidence)
|
224 |
+
except:
|
225 |
+
pass
|
226 |
+
|
227 |
+
# Process images & generate predictions
|
228 |
+
for path in image_paths:
|
229 |
+
if isinstance(path, str) and (path in embeddings):
|
230 |
+
try:
|
231 |
+
img_bytes = bucket.blob(path).download_as_string()
|
232 |
+
img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
|
233 |
+
images.append(img)
|
234 |
+
except:
|
235 |
+
continue
|
236 |
+
|
237 |
+
# Model Prediction
|
238 |
+
emb = np.expand_dims(embeddings[path], axis=0)
|
239 |
+
pred = model.predict(emb)[0]
|
240 |
+
pred_dict = {cond: round(float(prob), 3) for cond, prob in zip(mlb.classes_, pred)}
|
241 |
+
predictions_text.append(str(pred_dict))
|
242 |
+
|
243 |
+
# Format the output
|
244 |
+
predictions_text = "\n".join(predictions_text) if predictions_text else "No predictions available."
|
245 |
+
dermatologist_conditions = str(dermatologist_conditions)
|
246 |
+
dermatologist_confidence = str(dermatologist_confidence)
|
247 |
+
|
248 |
+
return images, predictions_text, dermatologist_conditions, dermatologist_confidence
|
249 |
+
|
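    # Illustrative output (added for clarity, not in the original commit): each
    # line of the predictions textbox is a dict of per-condition probabilities,
    # e.g. {'Eczema': 0.41, 'Allergic Contact Dermatitis': 0.07, ...}.
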
    # Create the Gradio interface with a dropdown of case IDs.
    iface = gr.Interface(
        fn=predict_case,
        inputs=gr.Dropdown(choices=case_ids, label="Select a Case ID"),
        outputs=[
            gr.Gallery(label="Case Images"),
            gr.Textbox(label="Model's Predictions"),
            gr.Textbox(label="Dermatologist's Skin Conditions"),
            gr.Textbox(label="Dermatologist's Confidence Ratings")
        ],
        title="Derm Foundation Skin Conditions Explorer",
        description="Select a Case ID from the dropdown to view images and predictions."
    )

    iface.launch(share=True)


if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio
tensorflow
numpy
pandas
scikit-learn
google-cloud-storage
huggingface_hub
pillow
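
One caveat about the list above: app.py imports load_dotenv from dotenv, which is provided by the python-dotenv package and is not listed here, so a local install needs it on top of these requirements, e.g.:

    pip install -r requirements.txt python-dotenv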