File size: 2,189 Bytes
5630ebc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
This script is used to curate the data for the project. 

Implement your functions to clean the data and prepare it for model training.

Note: the competition requires that you use FiftyOne for data curation, and you are only allowed to
use the approved dataset from the hub, Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set, which can
be found here: https://huggingface.co/datasets/Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set
"""

import fiftyone as fo
import fiftyone.utils.huggingface as fouh

# Implement functions for data curation. The functions below are just dummy examples.

def shuffle_data(dataset, seed=51):
    """Shuffle the dataset.

    Args:
        dataset: Any object exposing a ``shuffle(seed=...)`` method, such as
            a FiftyOne dataset or view.
        seed (int): Random seed forwarded to ``dataset.shuffle``. Defaults to
            51, the value this function previously hard-coded.

    Returns:
        Whatever ``dataset.shuffle(seed=seed)`` returns.
    """
    return dataset.shuffle(seed=seed)

def take_random_sample(dataset, size=10, seed=51):
    """Take a random sample from the dataset.

    Args:
        dataset: Any object exposing a ``take(size=..., seed=...)`` method,
            such as a FiftyOne dataset or view.
        size (int): Number of samples to request. Defaults to 10, the value
            this function previously hard-coded.
        seed (int): Random seed forwarded to ``dataset.take``. Defaults to
            51, the value this function previously hard-coded.

    Returns:
        Whatever ``dataset.take(size=size, seed=seed)`` returns.
    """
    return dataset.take(size=size, seed=seed)

def prepare_dataset(name):
    """
    Prepare the dataset for model training.

    Validates ``name`` against the approved dataset, loads its "train" split
    from the hub, applies the curation steps (``shuffle_data`` followed by
    ``take_random_sample``), and returns a clone of the result.

    Args:
        name (str): The name of the dataset to load. Must be "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set".

    Returns:
        fiftyone.core.dataset.Dataset: The curated dataset.

    Raises:
        ValueError: If the provided dataset name is not the approved one.

    Note:
        The following code block MUST NOT be removed from your submission:

        APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

        if name != APPROVED_DATASET:
            raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

        This ensures that only the approved dataset is used for the competition.
    """
    APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

    # Reject any other dataset before touching the network.
    if name != APPROVED_DATASET:
        raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

    # Load the approved dataset from the hub
    dataset = fouh.load_from_hub(name, split="train")

    # Implement your data curation functions here
    dataset = shuffle_data(dataset)
    dataset = take_random_sample(dataset)

    # Return the curated dataset; clone() is called on the curated result so
    # the returned object is a copy rather than the loaded dataset itself.
    curated_dataset = dataset.clone()
    return curated_dataset