Nonnormalizable committed on
Commit
b3f06b6
·
1 Parent(s): 79be168

API routes. black formatting.

Browse files
Finetune BERT.ipynb CHANGED
@@ -62,8 +62,9 @@
62
  " time_str = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
63
  " print(time_str, x)\n",
64
  "\n",
 
65
  "class BertClassifier(nn.Module, PyTorchModelHubMixin):\n",
66
- " def __init__(self, num_labels=8, bert_variety='bert-base-uncased'):\n",
67
  " super().__init__()\n",
68
  " self.bert = BertModel.from_pretrained(bert_variety)\n",
69
  " self.dropout = nn.Dropout(0.05)\n",
@@ -76,6 +77,7 @@
76
  " logits = self.classifier(pooled_output)\n",
77
  " return logits\n",
78
  "\n",
 
79
  "class TextDataset(Dataset):\n",
80
  " def __init__(self, texts, labels, tokenizer, max_length=512):\n",
81
  " self.encodings = tokenizer(\n",
@@ -83,32 +85,33 @@
83
  " truncation=True,\n",
84
  " padding=True,\n",
85
  " max_length=max_length,\n",
86
- " return_tensors='pt',\n",
87
  " )\n",
88
  " self.labels = torch.tensor([int(l[0]) for l in labels])\n",
89
  "\n",
90
  " def __getitem__(self, idx):\n",
91
  " item = {key: val[idx] for key, val in self.encodings.items()}\n",
92
- " item['labels'] = self.labels[idx]\n",
93
  " return item\n",
94
  "\n",
95
  " def __len__(self) -> int:\n",
96
  " return len(self.labels)\n",
97
  "\n",
 
98
  "def train_model(model, train_dataloader, device, num_epochs):\n",
99
  " optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)\n",
100
  " criterion = nn.CrossEntropyLoss()\n",
101
  " model.train()\n",
102
  "\n",
103
- " my_print('Starting epoch 1.')\n",
104
  " for epoch in range(num_epochs):\n",
105
  " total_loss = 0\n",
106
  " for batch in train_dataloader:\n",
107
  " optimizer.zero_grad()\n",
108
  "\n",
109
- " input_ids = batch['input_ids'].to(device)\n",
110
- " attention_mask = batch['attention_mask'].to(device)\n",
111
- " labels = batch['labels'].to(device)\n",
112
  "\n",
113
  " outputs = model(input_ids, attention_mask)\n",
114
  " loss = criterion(outputs, labels)\n",
@@ -118,7 +121,7 @@
118
  "\n",
119
  " total_loss += loss.item()\n",
120
  " avg_loss = total_loss / len(train_dataloader)\n",
121
- " my_print(f'Epoch {epoch+1}/{num_epochs} done, Average Loss: {avg_loss:0.4f}')"
122
  ]
123
  },
124
  {
@@ -137,12 +140,12 @@
137
  "outputs": [],
138
  "source": [
139
  "if torch.backends.mps.is_available():\n",
140
- " device = torch.device('mps')\n",
141
  " torch.mps.empty_cache()\n",
142
  "elif torch.cuda.is_available():\n",
143
- " device = torch.device('cuda')\n",
144
  "else:\n",
145
- " device = torch.device('cpu')"
146
  ]
147
  },
148
  {
@@ -162,36 +165,36 @@
162
  "source": [
163
  "def run_training(\n",
164
  " max_dataset_size=16 * 200,\n",
165
- " bert_variety='bert-base-uncased',\n",
166
  " max_length=200,\n",
167
  " num_epochs=3,\n",
168
  " batch_size=32,\n",
169
  "):\n",
170
  " hf_dataset = load_dataset(\"quotaclimat/frugalaichallenge-text-train\")\n",
171
- " if not max_dataset_size == 'full' and max_dataset_size < len(hf_dataset['train']):\n",
172
- " train_dataset = hf_dataset['train'][:max_dataset_size]\n",
173
  " else:\n",
174
- " train_dataset = hf_dataset['train']\n",
175
- " \n",
176
  " tokenizer = BertTokenizer.from_pretrained(bert_variety, max_length=max_length)\n",
177
  " model = BertClassifier(bert_variety=bert_variety)\n",
178
  " if torch.backends.mps.is_available():\n",
179
- " device = torch.device('mps')\n",
180
  " torch.mps.empty_cache()\n",
181
  " elif torch.cuda.is_available():\n",
182
- " device = torch.device('cuda')\n",
183
  " else:\n",
184
- " device = torch.device('cpu')\n",
185
  " model.to(device)\n",
186
- " \n",
187
  " dataset = TextDataset(\n",
188
- " train_dataset['quote'],\n",
189
- " train_dataset['label'],\n",
190
  " tokenizer=tokenizer,\n",
191
  " max_length=max_length,\n",
192
  " )\n",
193
  " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
194
- " \n",
195
  " train_model(model, dataloader, device, num_epochs=num_epochs)\n",
196
  " return model, tokenizer"
197
  ]
@@ -224,7 +227,7 @@
224
  "source": [
225
  "model, tokenizer = run_training(\n",
226
  " max_dataset_size=16 * 100,\n",
227
- " bert_variety='bert-base-uncased',\n",
228
  " max_length=128,\n",
229
  " num_epochs=3,\n",
230
  " batch_size=32,\n",
@@ -256,27 +259,27 @@
256
  "source": [
257
  "model.eval()\n",
258
  "test_text = [\n",
259
- " 'This was a great experience!', # 0_not_relevant\n",
260
- " 'My favorite hike is Laguna de los Tres.', # 0_not_relevant\n",
261
- " 'Crops will grow great in Finland if it\\'s warmer there.', # 3_not_bad\n",
262
- " 'Climate change is fake.', # 1_not_happening\n",
263
- " 'The apparent warming is caused by solar cycles.', # 2_not_human\n",
264
- " 'Solar panels emit bad vibes.', # 4_solutions_harmful_unnecessary\n",
265
- " 'All those so-called scientists are Democrats.', # 6_proponents_biased\n",
266
  "]\n",
267
  "test_encoding = tokenizer(\n",
268
  " test_text,\n",
269
  " truncation=True,\n",
270
  " padding=True,\n",
271
- " return_tensors='pt',\n",
272
  ")\n",
273
  "\n",
274
  "with torch.no_grad():\n",
275
- " test_input_ids = test_encoding['input_ids'].to(device)\n",
276
- " test_attention_mask = test_encoding['attention_mask'].to(device)\n",
277
  " outputs = model(test_input_ids, test_attention_mask)\n",
278
  " predictions = torch.argmax(outputs, dim=1)\n",
279
- " my_print(f'Predictions: {predictions}')"
280
  ]
281
  },
282
  {
@@ -306,8 +309,8 @@
306
  ],
307
  "source": [
308
  "model, tokenizer = run_training(\n",
309
- " max_dataset_size='full',\n",
310
- " bert_variety='bert-base-uncased',\n",
311
  " max_length=64,\n",
312
  " num_epochs=3,\n",
313
  " batch_size=32,\n",
@@ -341,8 +344,8 @@
341
  ],
342
  "source": [
343
  "model, tokenizer = run_training(\n",
344
- " max_dataset_size='full',\n",
345
- " bert_variety='bert-base-uncased',\n",
346
  " max_length=128,\n",
347
  " num_epochs=3,\n",
348
  " batch_size=32,\n",
@@ -376,8 +379,8 @@
376
  ],
377
  "source": [
378
  "model, tokenizer = run_training(\n",
379
- " max_dataset_size='full',\n",
380
- " bert_variety='bert-base-uncased',\n",
381
  " max_length=128,\n",
382
  " num_epochs=3,\n",
383
  " batch_size=16,\n",
@@ -411,8 +414,8 @@
411
  ],
412
  "source": [
413
  "model, tokenizer = run_training(\n",
414
- " max_dataset_size='full',\n",
415
- " bert_variety='bert-base-uncased',\n",
416
  " max_length=256,\n",
417
  " num_epochs=3,\n",
418
  " batch_size=16,\n",
@@ -454,8 +457,8 @@
454
  ],
455
  "source": [
456
  "model_final, tokenizer_final = run_training(\n",
457
- " max_dataset_size='full',\n",
458
- " bert_variety='bert-base-uncased',\n",
459
  " max_length=128,\n",
460
  " num_epochs=3,\n",
461
  " batch_size=16,\n",
@@ -487,27 +490,27 @@
487
  "source": [
488
  "model_final.eval()\n",
489
  "test_text = [\n",
490
- " 'This was a great experience!', # 0_not_relevant\n",
491
- " 'My favorite hike is Laguna de los Tres.', # 0_not_relevant\n",
492
- " 'Crops will grow great in Finland if it\\'s warmer there.', # 3_not_bad\n",
493
- " 'Climate change is fake.', # 1_not_happening\n",
494
- " 'The apparent warming is caused by solar cycles.', # 2_not_human\n",
495
- " 'Solar panels emit bad vibes.', # 4_solutions_harmful_unnecessary\n",
496
- " 'All those so-called scientists are Democrats.', # 6_proponents_biased\n",
497
  "]\n",
498
  "test_encoding = tokenizer_final(\n",
499
  " test_text,\n",
500
  " truncation=True,\n",
501
  " padding=True,\n",
502
- " return_tensors='pt',\n",
503
  ")\n",
504
  "\n",
505
  "with torch.no_grad():\n",
506
- " test_input_ids = test_encoding['input_ids'].to(device)\n",
507
- " test_attention_mask = test_encoding['attention_mask'].to(device)\n",
508
  " outputs = model_final(test_input_ids, test_attention_mask)\n",
509
  " predictions = torch.argmax(outputs, dim=1)\n",
510
- " my_print(f'Predictions: {predictions}')"
511
  ]
512
  },
513
  {
@@ -550,7 +553,7 @@
550
  }
551
  ],
552
  "source": [
553
- "model_final.push_to_hub('frugal-ai-text-bert-base')"
554
  ]
555
  },
556
  {
@@ -593,7 +596,7 @@
593
  }
594
  ],
595
  "source": [
596
- "tokenizer_final.push_to_hub('frugal-ai-text-bert-base')"
597
  ]
598
  },
599
  {
 
62
  " time_str = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
63
  " print(time_str, x)\n",
64
  "\n",
65
+ "\n",
66
  "class BertClassifier(nn.Module, PyTorchModelHubMixin):\n",
67
+ " def __init__(self, num_labels=8, bert_variety=\"bert-base-uncased\"):\n",
68
  " super().__init__()\n",
69
  " self.bert = BertModel.from_pretrained(bert_variety)\n",
70
  " self.dropout = nn.Dropout(0.05)\n",
 
77
  " logits = self.classifier(pooled_output)\n",
78
  " return logits\n",
79
  "\n",
80
+ "\n",
81
  "class TextDataset(Dataset):\n",
82
  " def __init__(self, texts, labels, tokenizer, max_length=512):\n",
83
  " self.encodings = tokenizer(\n",
 
85
  " truncation=True,\n",
86
  " padding=True,\n",
87
  " max_length=max_length,\n",
88
+ " return_tensors=\"pt\",\n",
89
  " )\n",
90
  " self.labels = torch.tensor([int(l[0]) for l in labels])\n",
91
  "\n",
92
  " def __getitem__(self, idx):\n",
93
  " item = {key: val[idx] for key, val in self.encodings.items()}\n",
94
+ " item[\"labels\"] = self.labels[idx]\n",
95
  " return item\n",
96
  "\n",
97
  " def __len__(self) -> int:\n",
98
  " return len(self.labels)\n",
99
  "\n",
100
+ "\n",
101
  "def train_model(model, train_dataloader, device, num_epochs):\n",
102
  " optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)\n",
103
  " criterion = nn.CrossEntropyLoss()\n",
104
  " model.train()\n",
105
  "\n",
106
+ " my_print(\"Starting epoch 1.\")\n",
107
  " for epoch in range(num_epochs):\n",
108
  " total_loss = 0\n",
109
  " for batch in train_dataloader:\n",
110
  " optimizer.zero_grad()\n",
111
  "\n",
112
+ " input_ids = batch[\"input_ids\"].to(device)\n",
113
+ " attention_mask = batch[\"attention_mask\"].to(device)\n",
114
+ " labels = batch[\"labels\"].to(device)\n",
115
  "\n",
116
  " outputs = model(input_ids, attention_mask)\n",
117
  " loss = criterion(outputs, labels)\n",
 
121
  "\n",
122
  " total_loss += loss.item()\n",
123
  " avg_loss = total_loss / len(train_dataloader)\n",
124
+ " my_print(f\"Epoch {epoch+1}/{num_epochs} done, Average Loss: {avg_loss:0.4f}\")"
125
  ]
126
  },
127
  {
 
140
  "outputs": [],
141
  "source": [
142
  "if torch.backends.mps.is_available():\n",
143
+ " device = torch.device(\"mps\")\n",
144
  " torch.mps.empty_cache()\n",
145
  "elif torch.cuda.is_available():\n",
146
+ " device = torch.device(\"cuda\")\n",
147
  "else:\n",
148
+ " device = torch.device(\"cpu\")"
149
  ]
150
  },
151
  {
 
165
  "source": [
166
  "def run_training(\n",
167
  " max_dataset_size=16 * 200,\n",
168
+ " bert_variety=\"bert-base-uncased\",\n",
169
  " max_length=200,\n",
170
  " num_epochs=3,\n",
171
  " batch_size=32,\n",
172
  "):\n",
173
  " hf_dataset = load_dataset(\"quotaclimat/frugalaichallenge-text-train\")\n",
174
+ " if not max_dataset_size == \"full\" and max_dataset_size < len(hf_dataset[\"train\"]):\n",
175
+ " train_dataset = hf_dataset[\"train\"][:max_dataset_size]\n",
176
  " else:\n",
177
+ " train_dataset = hf_dataset[\"train\"]\n",
178
+ "\n",
179
  " tokenizer = BertTokenizer.from_pretrained(bert_variety, max_length=max_length)\n",
180
  " model = BertClassifier(bert_variety=bert_variety)\n",
181
  " if torch.backends.mps.is_available():\n",
182
+ " device = torch.device(\"mps\")\n",
183
  " torch.mps.empty_cache()\n",
184
  " elif torch.cuda.is_available():\n",
185
+ " device = torch.device(\"cuda\")\n",
186
  " else:\n",
187
+ " device = torch.device(\"cpu\")\n",
188
  " model.to(device)\n",
189
+ "\n",
190
  " dataset = TextDataset(\n",
191
+ " train_dataset[\"quote\"],\n",
192
+ " train_dataset[\"label\"],\n",
193
  " tokenizer=tokenizer,\n",
194
  " max_length=max_length,\n",
195
  " )\n",
196
  " dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
197
+ "\n",
198
  " train_model(model, dataloader, device, num_epochs=num_epochs)\n",
199
  " return model, tokenizer"
200
  ]
 
227
  "source": [
228
  "model, tokenizer = run_training(\n",
229
  " max_dataset_size=16 * 100,\n",
230
+ " bert_variety=\"bert-base-uncased\",\n",
231
  " max_length=128,\n",
232
  " num_epochs=3,\n",
233
  " batch_size=32,\n",
 
259
  "source": [
260
  "model.eval()\n",
261
  "test_text = [\n",
262
+ " \"This was a great experience!\", # 0_not_relevant\n",
263
+ " \"My favorite hike is Laguna de los Tres.\", # 0_not_relevant\n",
264
+ " \"Crops will grow great in Finland if it's warmer there.\", # 3_not_bad\n",
265
+ " \"Climate change is fake.\", # 1_not_happening\n",
266
+ " \"The apparent warming is caused by solar cycles.\", # 2_not_human\n",
267
+ " \"Solar panels emit bad vibes.\", # 4_solutions_harmful_unnecessary\n",
268
+ " \"All those so-called scientists are Democrats.\", # 6_proponents_biased\n",
269
  "]\n",
270
  "test_encoding = tokenizer(\n",
271
  " test_text,\n",
272
  " truncation=True,\n",
273
  " padding=True,\n",
274
+ " return_tensors=\"pt\",\n",
275
  ")\n",
276
  "\n",
277
  "with torch.no_grad():\n",
278
+ " test_input_ids = test_encoding[\"input_ids\"].to(device)\n",
279
+ " test_attention_mask = test_encoding[\"attention_mask\"].to(device)\n",
280
  " outputs = model(test_input_ids, test_attention_mask)\n",
281
  " predictions = torch.argmax(outputs, dim=1)\n",
282
+ " my_print(f\"Predictions: {predictions}\")"
283
  ]
284
  },
285
  {
 
309
  ],
310
  "source": [
311
  "model, tokenizer = run_training(\n",
312
+ " max_dataset_size=\"full\",\n",
313
+ " bert_variety=\"bert-base-uncased\",\n",
314
  " max_length=64,\n",
315
  " num_epochs=3,\n",
316
  " batch_size=32,\n",
 
344
  ],
345
  "source": [
346
  "model, tokenizer = run_training(\n",
347
+ " max_dataset_size=\"full\",\n",
348
+ " bert_variety=\"bert-base-uncased\",\n",
349
  " max_length=128,\n",
350
  " num_epochs=3,\n",
351
  " batch_size=32,\n",
 
379
  ],
380
  "source": [
381
  "model, tokenizer = run_training(\n",
382
+ " max_dataset_size=\"full\",\n",
383
+ " bert_variety=\"bert-base-uncased\",\n",
384
  " max_length=128,\n",
385
  " num_epochs=3,\n",
386
  " batch_size=16,\n",
 
414
  ],
415
  "source": [
416
  "model, tokenizer = run_training(\n",
417
+ " max_dataset_size=\"full\",\n",
418
+ " bert_variety=\"bert-base-uncased\",\n",
419
  " max_length=256,\n",
420
  " num_epochs=3,\n",
421
  " batch_size=16,\n",
 
457
  ],
458
  "source": [
459
  "model_final, tokenizer_final = run_training(\n",
460
+ " max_dataset_size=\"full\",\n",
461
+ " bert_variety=\"bert-base-uncased\",\n",
462
  " max_length=128,\n",
463
  " num_epochs=3,\n",
464
  " batch_size=16,\n",
 
490
  "source": [
491
  "model_final.eval()\n",
492
  "test_text = [\n",
493
+ " \"This was a great experience!\", # 0_not_relevant\n",
494
+ " \"My favorite hike is Laguna de los Tres.\", # 0_not_relevant\n",
495
+ " \"Crops will grow great in Finland if it's warmer there.\", # 3_not_bad\n",
496
+ " \"Climate change is fake.\", # 1_not_happening\n",
497
+ " \"The apparent warming is caused by solar cycles.\", # 2_not_human\n",
498
+ " \"Solar panels emit bad vibes.\", # 4_solutions_harmful_unnecessary\n",
499
+ " \"All those so-called scientists are Democrats.\", # 6_proponents_biased\n",
500
  "]\n",
501
  "test_encoding = tokenizer_final(\n",
502
  " test_text,\n",
503
  " truncation=True,\n",
504
  " padding=True,\n",
505
+ " return_tensors=\"pt\",\n",
506
  ")\n",
507
  "\n",
508
  "with torch.no_grad():\n",
509
+ " test_input_ids = test_encoding[\"input_ids\"].to(device)\n",
510
+ " test_attention_mask = test_encoding[\"attention_mask\"].to(device)\n",
511
  " outputs = model_final(test_input_ids, test_attention_mask)\n",
512
  " predictions = torch.argmax(outputs, dim=1)\n",
513
+ " my_print(f\"Predictions: {predictions}\")"
514
  ]
515
  },
516
  {
 
553
  }
554
  ],
555
  "source": [
556
+ "model_final.push_to_hub(\"frugal-ai-text-bert-base\")"
557
  ]
558
  },
559
  {
 
596
  }
597
  ],
598
  "source": [
599
+ "tokenizer_final.push_to_hub(\"frugal-ai-text-bert-base\")"
600
  ]
601
  },
602
  {
app.py CHANGED
@@ -7,7 +7,7 @@ load_dotenv()
7
 
8
  app = FastAPI(
9
  title="Frugal AI Challenge API",
10
- description="API for the Frugal AI Challenge evaluation endpoints"
11
  )
12
 
13
  # Include all routers
@@ -15,6 +15,7 @@ app.include_router(text.router)
15
  app.include_router(image.router)
16
  app.include_router(audio.router)
17
 
 
18
  @app.get("/")
19
  async def root():
20
  return {
@@ -22,6 +23,6 @@ async def root():
22
  "endpoints": {
23
  "text": "/text - Text classification task",
24
  "image": "/image - Image classification task (coming soon)",
25
- "audio": "/audio - Audio classification task (coming soon)"
26
- }
27
- }
 
7
 
8
  app = FastAPI(
9
  title="Frugal AI Challenge API",
10
+ description="API for the Frugal AI Challenge evaluation endpoints",
11
  )
12
 
13
  # Include all routers
 
15
  app.include_router(image.router)
16
  app.include_router(audio.router)
17
 
18
+
19
  @app.get("/")
20
  async def root():
21
  return {
 
23
  "endpoints": {
24
  "text": "/text - Text classification task",
25
  "image": "/image - Image classification task (coming soon)",
26
+ "audio": "/audio - Audio classification task (coming soon)",
27
+ },
28
+ }
tasks/audio.py CHANGED
@@ -9,6 +9,7 @@ from .utils.evaluation import AudioEvaluationRequest
9
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
10
 
11
  from dotenv import load_dotenv
 
12
  load_dotenv()
13
 
14
  router = APIRouter()
@@ -17,13 +18,11 @@ DESCRIPTION = "Random Baseline"
17
  ROUTE = "/audio"
18
 
19
 
20
-
21
- @router.post(ROUTE, tags=["Audio Task"],
22
- description=DESCRIPTION)
23
  async def evaluate_audio(request: AudioEvaluationRequest):
24
  """
25
  Evaluate audio classification for rainforest sound detection.
26
-
27
  Current Model: Random Baseline
28
  - Makes random predictions from the label space (0-1)
29
  - Used as a baseline for comparison
@@ -32,41 +31,40 @@ async def evaluate_audio(request: AudioEvaluationRequest):
32
  username, space_url = get_space_info()
33
 
34
  # Define the label mapping
35
- LABEL_MAPPING = {
36
- "chainsaw": 0,
37
- "environment": 1
38
- }
39
  # Load and prepare the dataset
40
  # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
41
- dataset = load_dataset(request.dataset_name,token=os.getenv("HF_TOKEN"))
42
-
43
  # Split dataset
44
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
 
 
45
  test_dataset = train_test["test"]
46
-
47
  # Start tracking emissions
48
  tracker.start()
49
  tracker.start_task("inference")
50
-
51
- #--------------------------------------------------------------------------------------------
52
  # YOUR MODEL INFERENCE CODE HERE
53
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
54
- #--------------------------------------------------------------------------------------------
55
-
56
  # Make random predictions (placeholder for actual model inference)
57
  true_labels = test_dataset["label"]
58
  predictions = [random.randint(0, 1) for _ in range(len(true_labels))]
59
-
60
- #--------------------------------------------------------------------------------------------
61
  # YOUR MODEL INFERENCE STOPS HERE
62
- #--------------------------------------------------------------------------------------------
63
-
64
  # Stop tracking emissions
65
  emissions_data = tracker.stop_task()
66
-
67
  # Calculate accuracy
68
  accuracy = accuracy_score(true_labels, predictions)
69
-
70
  # Prepare results dictionary
71
  results = {
72
  "username": username,
@@ -81,8 +79,8 @@ async def evaluate_audio(request: AudioEvaluationRequest):
81
  "dataset_config": {
82
  "dataset_name": request.dataset_name,
83
  "test_size": request.test_size,
84
- "test_seed": request.test_seed
85
- }
86
  }
87
-
88
- return results
 
9
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
10
 
11
  from dotenv import load_dotenv
12
+
13
  load_dotenv()
14
 
15
  router = APIRouter()
 
18
  ROUTE = "/audio"
19
 
20
 
21
+ @router.post(ROUTE, tags=["Audio Task"], description=DESCRIPTION)
 
 
22
  async def evaluate_audio(request: AudioEvaluationRequest):
23
  """
24
  Evaluate audio classification for rainforest sound detection.
25
+
26
  Current Model: Random Baseline
27
  - Makes random predictions from the label space (0-1)
28
  - Used as a baseline for comparison
 
31
  username, space_url = get_space_info()
32
 
33
  # Define the label mapping
34
+ LABEL_MAPPING = {"chainsaw": 0, "environment": 1}
 
 
 
35
  # Load and prepare the dataset
36
  # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
37
+ dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
38
+
39
  # Split dataset
40
+ train_test = dataset["train"].train_test_split(
41
+ test_size=request.test_size, seed=request.test_seed
42
+ )
43
  test_dataset = train_test["test"]
44
+
45
  # Start tracking emissions
46
  tracker.start()
47
  tracker.start_task("inference")
48
+
49
+ # --------------------------------------------------------------------------------------------
50
  # YOUR MODEL INFERENCE CODE HERE
51
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
52
+ # --------------------------------------------------------------------------------------------
53
+
54
  # Make random predictions (placeholder for actual model inference)
55
  true_labels = test_dataset["label"]
56
  predictions = [random.randint(0, 1) for _ in range(len(true_labels))]
57
+
58
+ # --------------------------------------------------------------------------------------------
59
  # YOUR MODEL INFERENCE STOPS HERE
60
+ # --------------------------------------------------------------------------------------------
61
+
62
  # Stop tracking emissions
63
  emissions_data = tracker.stop_task()
64
+
65
  # Calculate accuracy
66
  accuracy = accuracy_score(true_labels, predictions)
67
+
68
  # Prepare results dictionary
69
  results = {
70
  "username": username,
 
79
  "dataset_config": {
80
  "dataset_name": request.dataset_name,
81
  "test_size": request.test_size,
82
+ "test_seed": request.test_seed,
83
+ },
84
  }
85
+
86
+ return results
tasks/image.py CHANGED
@@ -10,6 +10,7 @@ from .utils.evaluation import ImageEvaluationRequest
10
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
11
 
12
  from dotenv import load_dotenv
 
13
  load_dotenv()
14
 
15
  router = APIRouter()
@@ -17,6 +18,7 @@ router = APIRouter()
17
  DESCRIPTION = "Random Baseline"
18
  ROUTE = "/image"
19
 
 
20
  def parse_boxes(annotation_string):
21
  """Parse multiple boxes from a single annotation string.
22
  Each box has 5 values: class_id, x_center, y_center, width, height"""
@@ -26,39 +28,46 @@ def parse_boxes(annotation_string):
26
  for i in range(0, len(values), 5):
27
  if i + 5 <= len(values):
28
  # Skip class_id (first value) and take the next 4 values
29
- box = values[i+1:i+5]
30
  boxes.append(box)
31
  return boxes
32
 
 
33
  def compute_iou(box1, box2):
34
  """Compute Intersection over Union (IoU) between two YOLO format boxes."""
 
35
  # Convert YOLO format (x_center, y_center, width, height) to corners
36
  def yolo_to_corners(box):
37
  x_center, y_center, width, height = box
38
- x1 = x_center - width/2
39
- y1 = y_center - height/2
40
- x2 = x_center + width/2
41
- y2 = y_center + height/2
42
  return np.array([x1, y1, x2, y2])
43
-
44
  box1_corners = yolo_to_corners(box1)
45
  box2_corners = yolo_to_corners(box2)
46
-
47
  # Calculate intersection
48
  x1 = max(box1_corners[0], box2_corners[0])
49
  y1 = max(box1_corners[1], box2_corners[1])
50
  x2 = min(box1_corners[2], box2_corners[2])
51
  y2 = min(box1_corners[3], box2_corners[3])
52
-
53
  intersection = max(0, x2 - x1) * max(0, y2 - y1)
54
-
55
  # Calculate union
56
- box1_area = (box1_corners[2] - box1_corners[0]) * (box1_corners[3] - box1_corners[1])
57
- box2_area = (box2_corners[2] - box2_corners[0]) * (box2_corners[3] - box2_corners[1])
 
 
 
 
58
  union = box1_area + box2_area - intersection
59
-
60
  return intersection / (union + 1e-6)
61
 
 
62
  def compute_max_iou(true_boxes, pred_box):
63
  """Compute maximum IoU between a predicted box and all true boxes"""
64
  max_iou = 0
@@ -67,89 +76,91 @@ def compute_max_iou(true_boxes, pred_box):
67
  max_iou = max(max_iou, iou)
68
  return max_iou
69
 
70
- @router.post(ROUTE, tags=["Image Task"],
71
- description=DESCRIPTION)
72
  async def evaluate_image(request: ImageEvaluationRequest):
73
  """
74
  Evaluate image classification and object detection for forest fire smoke.
75
-
76
  Current Model: Random Baseline
77
  - Makes random predictions for both classification and bounding boxes
78
  - Used as a baseline for comparison
79
-
80
  Metrics:
81
  - Classification accuracy: Whether an image contains smoke or not
82
  - Object Detection accuracy: IoU (Intersection over Union) for smoke bounding boxes
83
  """
84
  # Get space info
85
  username, space_url = get_space_info()
86
-
87
  # Load and prepare the dataset
88
  dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
89
-
90
  # Split dataset
91
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
 
 
92
  test_dataset = train_test["test"]
93
-
94
  # Start tracking emissions
95
  tracker.start()
96
  tracker.start_task("inference")
97
-
98
- #--------------------------------------------------------------------------------------------
99
  # YOUR MODEL INFERENCE CODE HERE
100
  # Update the code below to replace the random baseline with your model inference
101
- #--------------------------------------------------------------------------------------------
102
-
103
  predictions = []
104
  true_labels = []
105
  pred_boxes = []
106
  true_boxes_list = [] # List of lists, each inner list contains boxes for one image
107
-
108
  for example in test_dataset:
109
  # Parse true annotation (YOLO format: class_id x_center y_center width height)
110
  annotation = example.get("annotations", "").strip()
111
  has_smoke = len(annotation) > 0
112
  true_labels.append(int(has_smoke))
113
-
114
  # Make random classification prediction
115
  pred_has_smoke = random.random() > 0.5
116
  predictions.append(int(pred_has_smoke))
117
-
118
  # If there's a true box, parse it and make random box prediction
119
  if has_smoke:
120
  # Parse all true boxes from the annotation
121
  image_true_boxes = parse_boxes(annotation)
122
  true_boxes_list.append(image_true_boxes)
123
-
124
  # For baseline, make one random box prediction per image
125
  # In a real model, you might want to predict multiple boxes
126
  random_box = [
127
  random.random(), # x_center
128
  random.random(), # y_center
129
  random.random() * 0.5, # width (max 0.5)
130
- random.random() * 0.5 # height (max 0.5)
131
  ]
132
  pred_boxes.append(random_box)
133
-
134
- #--------------------------------------------------------------------------------------------
135
  # YOUR MODEL INFERENCE STOPS HERE
136
- #--------------------------------------------------------------------------------------------
137
-
138
  # Stop tracking emissions
139
  emissions_data = tracker.stop_task()
140
-
141
  # Calculate classification accuracy
142
  classification_accuracy = accuracy_score(true_labels, predictions)
143
-
144
  # Calculate mean IoU for object detection (only for images with smoke)
145
  # For each image, we compute the max IoU between the predicted box and all true boxes
146
  ious = []
147
  for true_boxes, pred_box in zip(true_boxes_list, pred_boxes):
148
  max_iou = compute_max_iou(true_boxes, pred_box)
149
  ious.append(max_iou)
150
-
151
  mean_iou = float(np.mean(ious)) if ious else 0.0
152
-
153
  # Prepare results dictionary
154
  results = {
155
  "username": username,
@@ -165,8 +176,8 @@ async def evaluate_image(request: ImageEvaluationRequest):
165
  "dataset_config": {
166
  "dataset_name": request.dataset_name,
167
  "test_size": request.test_size,
168
- "test_seed": request.test_seed
169
- }
170
  }
171
-
172
- return results
 
10
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
11
 
12
  from dotenv import load_dotenv
13
+
14
  load_dotenv()
15
 
16
  router = APIRouter()
 
18
  DESCRIPTION = "Random Baseline"
19
  ROUTE = "/image"
20
 
21
+
22
  def parse_boxes(annotation_string):
23
  """Parse multiple boxes from a single annotation string.
24
  Each box has 5 values: class_id, x_center, y_center, width, height"""
 
28
  for i in range(0, len(values), 5):
29
  if i + 5 <= len(values):
30
  # Skip class_id (first value) and take the next 4 values
31
+ box = values[i + 1 : i + 5]
32
  boxes.append(box)
33
  return boxes
34
 
35
+
36
  def compute_iou(box1, box2):
37
  """Compute Intersection over Union (IoU) between two YOLO format boxes."""
38
+
39
  # Convert YOLO format (x_center, y_center, width, height) to corners
40
  def yolo_to_corners(box):
41
  x_center, y_center, width, height = box
42
+ x1 = x_center - width / 2
43
+ y1 = y_center - height / 2
44
+ x2 = x_center + width / 2
45
+ y2 = y_center + height / 2
46
  return np.array([x1, y1, x2, y2])
47
+
48
  box1_corners = yolo_to_corners(box1)
49
  box2_corners = yolo_to_corners(box2)
50
+
51
  # Calculate intersection
52
  x1 = max(box1_corners[0], box2_corners[0])
53
  y1 = max(box1_corners[1], box2_corners[1])
54
  x2 = min(box1_corners[2], box2_corners[2])
55
  y2 = min(box1_corners[3], box2_corners[3])
56
+
57
  intersection = max(0, x2 - x1) * max(0, y2 - y1)
58
+
59
  # Calculate union
60
+ box1_area = (box1_corners[2] - box1_corners[0]) * (
61
+ box1_corners[3] - box1_corners[1]
62
+ )
63
+ box2_area = (box2_corners[2] - box2_corners[0]) * (
64
+ box2_corners[3] - box2_corners[1]
65
+ )
66
  union = box1_area + box2_area - intersection
67
+
68
  return intersection / (union + 1e-6)
69
 
70
+
71
  def compute_max_iou(true_boxes, pred_box):
72
  """Compute maximum IoU between a predicted box and all true boxes"""
73
  max_iou = 0
 
76
  max_iou = max(max_iou, iou)
77
  return max_iou
78
 
79
+
80
+ @router.post(ROUTE, tags=["Image Task"], description=DESCRIPTION)
81
  async def evaluate_image(request: ImageEvaluationRequest):
82
  """
83
  Evaluate image classification and object detection for forest fire smoke.
84
+
85
  Current Model: Random Baseline
86
  - Makes random predictions for both classification and bounding boxes
87
  - Used as a baseline for comparison
88
+
89
  Metrics:
90
  - Classification accuracy: Whether an image contains smoke or not
91
  - Object Detection accuracy: IoU (Intersection over Union) for smoke bounding boxes
92
  """
93
  # Get space info
94
  username, space_url = get_space_info()
95
+
96
  # Load and prepare the dataset
97
  dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
98
+
99
  # Split dataset
100
+ train_test = dataset["train"].train_test_split(
101
+ test_size=request.test_size, seed=request.test_seed
102
+ )
103
  test_dataset = train_test["test"]
104
+
105
  # Start tracking emissions
106
  tracker.start()
107
  tracker.start_task("inference")
108
+
109
+ # --------------------------------------------------------------------------------------------
110
  # YOUR MODEL INFERENCE CODE HERE
111
  # Update the code below to replace the random baseline with your model inference
112
+ # --------------------------------------------------------------------------------------------
113
+
114
  predictions = []
115
  true_labels = []
116
  pred_boxes = []
117
  true_boxes_list = [] # List of lists, each inner list contains boxes for one image
118
+
119
  for example in test_dataset:
120
  # Parse true annotation (YOLO format: class_id x_center y_center width height)
121
  annotation = example.get("annotations", "").strip()
122
  has_smoke = len(annotation) > 0
123
  true_labels.append(int(has_smoke))
124
+
125
  # Make random classification prediction
126
  pred_has_smoke = random.random() > 0.5
127
  predictions.append(int(pred_has_smoke))
128
+
129
  # If there's a true box, parse it and make random box prediction
130
  if has_smoke:
131
  # Parse all true boxes from the annotation
132
  image_true_boxes = parse_boxes(annotation)
133
  true_boxes_list.append(image_true_boxes)
134
+
135
  # For baseline, make one random box prediction per image
136
  # In a real model, you might want to predict multiple boxes
137
  random_box = [
138
  random.random(), # x_center
139
  random.random(), # y_center
140
  random.random() * 0.5, # width (max 0.5)
141
+ random.random() * 0.5, # height (max 0.5)
142
  ]
143
  pred_boxes.append(random_box)
144
+
145
+ # --------------------------------------------------------------------------------------------
146
  # YOUR MODEL INFERENCE STOPS HERE
147
+ # --------------------------------------------------------------------------------------------
148
+
149
  # Stop tracking emissions
150
  emissions_data = tracker.stop_task()
151
+
152
  # Calculate classification accuracy
153
  classification_accuracy = accuracy_score(true_labels, predictions)
154
+
155
  # Calculate mean IoU for object detection (only for images with smoke)
156
  # For each image, we compute the max IoU between the predicted box and all true boxes
157
  ious = []
158
  for true_boxes, pred_box in zip(true_boxes_list, pred_boxes):
159
  max_iou = compute_max_iou(true_boxes, pred_box)
160
  ious.append(max_iou)
161
+
162
  mean_iou = float(np.mean(ious)) if ious else 0.0
163
+
164
  # Prepare results dictionary
165
  results = {
166
  "username": username,
 
176
  "dataset_config": {
177
  "dataset_name": request.dataset_name,
178
  "test_size": request.test_size,
179
+ "test_seed": request.test_seed,
180
+ },
181
  }
182
+
183
+ return results
tasks/text.py CHANGED
@@ -12,13 +12,21 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
12
 
13
  router = APIRouter()
14
 
15
- DESCRIPTION = "bert base finetuned"
 
 
 
 
 
 
 
 
16
  ROUTE = "/text"
17
 
18
 
19
  def baseline_model(dataset_length: int):
20
  # Make random predictions (placeholder for actual model inference)
21
- #predictions = [random.randint(0, 7) for _ in range(dataset_length)]
22
 
23
  # My favorite baseline is the most common class.
24
  predictions = [0] * dataset_length
@@ -26,48 +34,50 @@ def baseline_model(dataset_length: int):
26
  return predictions
27
 
28
 
29
- def bert_model(test_dataset):
30
- print('Starting my code block.')
31
  texts = test_dataset["quote"]
32
 
33
- model_repo = 'Nonnormalizable/frugal-ai-text-bert-base'
34
  config = AutoConfig.from_pretrained(model_repo)
35
  model = AutoModelForSequenceClassification.from_pretrained(model_repo)
36
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
37
 
38
  if torch.cuda.is_available():
39
- device = torch.device('cuda')
40
  else:
41
- device = torch.device('cpu')
42
- print('device:', device)
43
  model = model.to(device)
44
  test_encoding = tokenizer(
45
  texts,
46
  truncation=True,
47
  padding=True,
48
- return_tensors='pt',
49
- )
50
 
51
  model.eval()
52
  with torch.no_grad():
53
- test_input_ids = test_encoding['input_ids'].to(device)
54
- test_attention_mask = test_encoding['attention_mask'].to(device)
55
- print('Starting model run.')
56
  outputs = model(test_input_ids, test_attention_mask)
57
- print('End of model run.')
58
  predictions = torch.argmax(outputs.logits, dim=1)
59
  predictions = predictions.cpu().numpy()
60
-
61
- print('End of my code block.')
62
  return predictions
63
 
64
 
65
- @router.post(ROUTE, tags=["Text Task"],
66
- description=DESCRIPTION)
67
- async def evaluate_text(request: TextEvaluationRequest):
 
 
68
  """
69
  Evaluate text classification for climate disinformation detection.
70
-
71
  Current Model: Random Baseline
72
  - Makes random predictions from the label space (0-7)
73
  - Used as a baseline for comparison
@@ -84,7 +94,7 @@ async def evaluate_text(request: TextEvaluationRequest):
84
  "4_solutions_harmful_unnecessary": 4,
85
  "5_science_unreliable": 5,
86
  "6_proponents_biased": 6,
87
- "7_fossil_fuels_needed": 7
88
  }
89
 
90
  # Load and prepare the dataset
@@ -94,39 +104,44 @@ async def evaluate_text(request: TextEvaluationRequest):
94
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
95
 
96
  # Split dataset
97
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
 
 
98
  test_dataset = train_test["test"]
99
-
100
  # Start tracking emissions
101
  tracker.start()
102
  tracker.start_task("inference")
103
 
104
- #--------------------------------------------------------------------------------------------
105
  # YOUR MODEL INFERENCE CODE HERE
106
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
107
- #--------------------------------------------------------------------------------------------
108
 
109
  true_labels = test_dataset["label"]
110
- #predictions = baseline_model(len(true_labels))
111
- predictions = bert_model(test_dataset)
 
 
 
 
112
 
113
- #--------------------------------------------------------------------------------------------
114
  # YOUR MODEL INFERENCE STOPS HERE
115
- #--------------------------------------------------------------------------------------------
116
 
117
-
118
  # Stop tracking emissions
119
  emissions_data = tracker.stop_task()
120
-
121
  # Calculate accuracy
122
  accuracy = accuracy_score(true_labels, predictions)
123
-
124
  # Prepare results dictionary
125
  results = {
126
  "username": username,
127
  "space_url": space_url,
128
  "submission_timestamp": datetime.now().isoformat(),
129
- "model_description": DESCRIPTION,
130
  "accuracy": float(accuracy),
131
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
132
  "emissions_gco2eq": emissions_data.emissions * 1000,
@@ -135,8 +150,8 @@ async def evaluate_text(request: TextEvaluationRequest):
135
  "dataset_config": {
136
  "dataset_name": request.dataset_name,
137
  "test_size": request.test_size,
138
- "test_seed": request.test_seed
139
- }
140
  }
141
-
142
- return results
 
12
 
13
  router = APIRouter()
14
 
15
+ DESCRIPTIONS = {
16
+ "baseline": "baseline most common class",
17
+ "bert-base": "bert base finetuned",
18
+ "bert-medium": "to be implemented",
19
+ "bert-small": "to be implemented",
20
+ "bert-mini": "to be implemented",
21
+ "bert-tiny": "to be implemented",
22
+ }
23
+
24
  ROUTE = "/text"
25
 
26
 
27
  def baseline_model(dataset_length: int):
28
  # Make random predictions (placeholder for actual model inference)
29
+ # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
30
 
31
  # My favorite baseline is the most common class.
32
  predictions = [0] * dataset_length
 
34
  return predictions
35
 
36
 
37
def bert_model(test_dataset: dict, model_type: str, batch_size: int = 32):
    """Predict a class index for every quote using a fine-tuned checkpoint.

    Args:
        test_dataset: Object indexable by ``"quote"``, yielding the texts to
            classify (presumably a Hugging Face dataset split; confirm
            against the caller).
        model_type: Checkpoint suffix; the model and tokenizer are loaded from
            ``Nonnormalizable/frugal-ai-text-{model_type}``.
        batch_size: Number of quotes tokenized and scored per forward pass.
            Batching keeps peak memory bounded instead of pushing the whole
            split through the model at once.

    Returns:
        A NumPy array holding one predicted class index per quote, in the
        same order as the input texts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Starting my code block.")
    # Materialize as a plain list so slicing below behaves the same for any
    # column container the dataset library returns.
    texts = list(test_dataset["quote"])

    model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
    model = AutoModelForSequenceClassification.from_pretrained(model_repo)
    tokenizer = AutoTokenizer.from_pretrained(model_repo)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    model = model.to(device)
    model.eval()

    batch_predictions = []
    print("Starting model run.")
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoding = tokenizer(
                texts[start : start + batch_size],
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            outputs = model(
                input_ids=encoding["input_ids"].to(device),
                attention_mask=encoding["attention_mask"].to(device),
            )
            # Move each batch's result off the device right away so device
            # memory only ever holds one batch.
            batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())
    print("End of model run.")

    if batch_predictions:
        predictions = torch.cat(batch_predictions).numpy()
    else:
        # No input texts: return an empty integer array instead of failing
        # inside the tokenizer or torch.cat.
        predictions = torch.empty(0, dtype=torch.long).numpy()

    print("End of my code block.")
    return predictions
71
 
72
 
73
+ @router.post(ROUTE, tags=["Text Task"])
74
+ async def evaluate_text(
75
+ request: TextEvaluationRequest,
76
+ model_type="bert-base",
77
+ ):
78
  """
79
  Evaluate text classification for climate disinformation detection.
80
+
81
  Current Model: Random Baseline
82
  - Makes random predictions from the label space (0-7)
83
  - Used as a baseline for comparison
 
94
  "4_solutions_harmful_unnecessary": 4,
95
  "5_science_unreliable": 5,
96
  "6_proponents_biased": 6,
97
+ "7_fossil_fuels_needed": 7,
98
  }
99
 
100
  # Load and prepare the dataset
 
104
  dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
105
 
106
  # Split dataset
107
+ train_test = dataset["train"].train_test_split(
108
+ test_size=request.test_size, seed=request.test_seed
109
+ )
110
  test_dataset = train_test["test"]
111
+
112
  # Start tracking emissions
113
  tracker.start()
114
  tracker.start_task("inference")
115
 
116
+ # --------------------------------------------------------------------------------------------
117
  # YOUR MODEL INFERENCE CODE HERE
118
  # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
119
+ # --------------------------------------------------------------------------------------------
120
 
121
  true_labels = test_dataset["label"]
122
+ if model_type == "baseline":
123
+ predictions = baseline_model(len(true_labels))
124
+ elif model_type[:5] == "bert-":
125
+ predictions = bert_model(test_dataset, model_type)
126
+ else:
127
+ raise ValueError(model_type)
128
 
129
+ # --------------------------------------------------------------------------------------------
130
  # YOUR MODEL INFERENCE STOPS HERE
131
+ # --------------------------------------------------------------------------------------------
132
 
 
133
  # Stop tracking emissions
134
  emissions_data = tracker.stop_task()
135
+
136
  # Calculate accuracy
137
  accuracy = accuracy_score(true_labels, predictions)
138
+
139
  # Prepare results dictionary
140
  results = {
141
  "username": username,
142
  "space_url": space_url,
143
  "submission_timestamp": datetime.now().isoformat(),
144
+ "model_description": DESCRIPTIONS[model_type],
145
  "accuracy": float(accuracy),
146
  "energy_consumed_wh": emissions_data.energy_consumed * 1000,
147
  "emissions_gco2eq": emissions_data.emissions * 1000,
 
150
  "dataset_config": {
151
  "dataset_name": request.dataset_name,
152
  "test_size": request.test_size,
153
+ "test_seed": request.test_seed,
154
+ },
155
  }
156
+
157
+ return results
tasks/utils/emissions.py CHANGED
@@ -4,17 +4,26 @@ import os
4
  # Initialize tracker
5
  tracker = EmissionsTracker(allow_multiple_runs=True)
6
 
 
7
  class EmissionsData:
8
  def __init__(self, energy_consumed: float, emissions: float):
9
  self.energy_consumed = energy_consumed
10
  self.emissions = emissions
11
 
 
12
  def clean_emissions_data(emissions_data):
13
  """Remove unwanted fields from emissions data"""
14
  data_dict = emissions_data.__dict__
15
- fields_to_remove = ['timestamp', 'project_name', 'experiment_id', 'latitude', 'longitude']
 
 
 
 
 
 
16
  return {k: v for k, v in data_dict.items() if k not in fields_to_remove}
17
 
 
18
  def get_space_info():
19
  """Get the space username and URL from environment variables"""
20
  space_name = os.getenv("SPACE_ID", "")
@@ -25,4 +34,4 @@ def get_space_info():
25
  return username, space_url
26
  except Exception as e:
27
  print(f"Error getting space info: {e}")
28
- return "local-user", "local-development"
 
4
  # Initialize tracker
5
  tracker = EmissionsTracker(allow_multiple_runs=True)
6
 
7
+
8
  class EmissionsData:
9
  def __init__(self, energy_consumed: float, emissions: float):
10
  self.energy_consumed = energy_consumed
11
  self.emissions = emissions
12
 
13
+
14
  def clean_emissions_data(emissions_data):
15
  """Remove unwanted fields from emissions data"""
16
  data_dict = emissions_data.__dict__
17
+ fields_to_remove = [
18
+ "timestamp",
19
+ "project_name",
20
+ "experiment_id",
21
+ "latitude",
22
+ "longitude",
23
+ ]
24
  return {k: v for k, v in data_dict.items() if k not in fields_to_remove}
25
 
26
+
27
  def get_space_info():
28
  """Get the space username and URL from environment variables"""
29
  space_name = os.getenv("SPACE_ID", "")
 
34
  return username, space_url
35
  except Exception as e:
36
  print(f"Error getting space info: {e}")
37
+ return "local-user", "local-development"
tasks/utils/evaluation.py CHANGED
@@ -1,18 +1,28 @@
1
  from typing import Optional
2
  from pydantic import BaseModel, Field
3
 
 
4
  class BaseEvaluationRequest(BaseModel):
5
- test_size: float = Field(0.2, ge=0.0, le=1.0, description="Size of the test split (between 0 and 1)")
 
 
6
  test_seed: int = Field(42, ge=0, description="Random seed for reproducibility")
7
 
 
8
  class TextEvaluationRequest(BaseEvaluationRequest):
9
- dataset_name: str = Field("QuotaClimat/frugalaichallenge-text-train",
10
- description="The name of the dataset on HuggingFace Hub")
 
 
 
11
 
12
  class ImageEvaluationRequest(BaseEvaluationRequest):
13
- dataset_name: str = Field("pyronear/pyro-sdis",
14
- description="The name of the dataset on HuggingFace Hub")
 
 
15
 
16
  class AudioEvaluationRequest(BaseEvaluationRequest):
17
- dataset_name: str = Field("rfcx/frugalai",
18
- description="The name of the dataset on HuggingFace Hub")
 
 
1
  from typing import Optional
2
  from pydantic import BaseModel, Field
3
 
4
+
5
  class BaseEvaluationRequest(BaseModel):
6
+ test_size: float = Field(
7
+ 0.2, ge=0.0, le=1.0, description="Size of the test split (between 0 and 1)"
8
+ )
9
  test_seed: int = Field(42, ge=0, description="Random seed for reproducibility")
10
 
11
+
12
  class TextEvaluationRequest(BaseEvaluationRequest):
13
+ dataset_name: str = Field(
14
+ "QuotaClimat/frugalaichallenge-text-train",
15
+ description="The name of the dataset on HuggingFace Hub",
16
+ )
17
+
18
 
19
  class ImageEvaluationRequest(BaseEvaluationRequest):
20
+ dataset_name: str = Field(
21
+ "pyronear/pyro-sdis", description="The name of the dataset on HuggingFace Hub"
22
+ )
23
+
24
 
25
  class AudioEvaluationRequest(BaseEvaluationRequest):
26
+ dataset_name: str = Field(
27
+ "rfcx/frugalai", description="The name of the dataset on HuggingFace Hub"
28
+ )