# streamlit_demo/src/datasets/asvspoof_dataset.py
# ldhldh's picture
# Upload 28 files
# 2c0f55c verified
from pathlib import Path
import pandas as pd
if __name__ == "__main__":
import sys
sys.path.append(str(Path(__file__).parent.parent.parent.absolute()))
from src.datasets.base_dataset import SimpleAudioFakeDataset
# Attack identifiers shared by every split, kept in the order the split lists
# have always used.
_ASVSPOOF_ATTACKS = (
    "A01", "A07", "A08", "A02", "A09", "A10", "A03", "A04", "A05", "A06",
    "A11", "A12", "A13", "A14", "A15", "A16", "A17", "A18", "A19",
)

# Split configuration read by ASVSpoofDataset: one attack list per split name
# (each an independent list object), plus the partition ratio and the seed.
# NOTE(review): the ratio and seed are only stored on the dataset here; how
# they are consumed is defined outside this block -- confirm in the base class.
ASVSPOOF_SPLIT = {
    **{split: list(_ASVSPOOF_ATTACKS) for split in ("train", "test", "val")},
    "partition_ratio": [0.7, 0.15],
    "seed": 45,
}
class ASVSpoofDataset(SimpleAudioFakeDataset):
    """ASVspoof2019 LA samples collected from every subset directory.

    For each entry of ``subsets`` the matching protocol file is parsed,
    spoofed utterances are filtered by the attacks allowed for the requested
    split, and the resulting rows are concatenated into ``self.samples``.
    """

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"
    subsets = ("train", "dev", "eval")

    def __init__(self, path, subset="train", transform=None):
        """Load the sample table for one split.

        Args:
            path: Root directory holding the protocol folder and the
                ``ASVspoof2019_LA_<subset>`` directories.
            subset: Key of ``ASVSPOOF_SPLIT`` selecting the allowed attacks;
                also forwarded to the base-class constructor.
            transform: Forwarded to the base-class constructor and stored on
                the instance.

        Raises:
            FileNotFoundError: If no protocol file matches one of the
                entries in ``subsets``.
        """
        super().__init__(subset, transform)
        self.path = path

        self.allowed_attacks = ASVSPOOF_SPLIT[subset]
        self.partition_ratio = ASVSPOOF_SPLIT["partition_ratio"]
        self.seed = ASVSPOOF_SPLIT["seed"]

        # Keep the attribute defined while the protocols are being read.
        self.samples = pd.DataFrame()

        # A separate loop name keeps the ``subset`` argument intact.
        frames = []
        for subset_name in self.subsets:
            subset_dir = Path(self.path) / f"{self.subset_dir_prefix}{subset_name}"
            protocol_path = self.get_protocol_path(subset_name)
            if protocol_path is None:
                # Fail with a clear message instead of a TypeError from open(None).
                protocol_dir = Path(self.path) / self.protocol_folder_name
                raise FileNotFoundError(
                    f"No protocol file matching '{subset_name}' found in "
                    f"{protocol_dir}"
                )
            frames.append(self.read_protocol(subset_dir, protocol_path))

        # Concatenate once instead of growing a frame inside the loop.
        if frames:
            self.samples = pd.concat(frames)

        self.transform = transform

    def get_protocol_path(self, subset):
        """Return the first ``*.txt`` protocol file whose stem contains ``subset``.

        Candidates are visited in sorted order so the result does not depend
        on the order in which the filesystem lists the directory.

        Returns:
            The matching ``Path``, or ``None`` when nothing matches.
        """
        protocol_dir = Path(self.path) / self.protocol_folder_name
        for candidate in sorted(protocol_dir.glob("*.txt")):
            if subset in candidate.stem:
                return candidate
        return None

    def read_protocol(self, subset_dir, protocol_path):
        """Parse one protocol file into a DataFrame of samples.

        The fourth space-separated field of each line is the attack type;
        ``"-"`` marks a real sample. Lines whose attack type is neither
        ``"-"`` nor in ``self.allowed_attacks`` are ignored, as are blank
        lines. Both groups go through ``self.split_samples`` (not defined in
        this class -- presumably inherited from the base class) before being
        added, fake samples first.

        Args:
            subset_dir: Directory that contains the ``flac`` folder.
            protocol_path: Protocol file to read.

        Returns:
            DataFrame with the columns ``user_id``, ``sample_name``,
            ``attack_type``, ``label`` and ``path``.
        """
        samples = {
            "user_id": [],
            "sample_name": [],
            "attack_type": [],
            "label": [],
            "path": [],
        }

        real_samples = []
        fake_samples = []
        with open(protocol_path, "r") as file:
            for line in file:
                if not line.strip():
                    # A stray blank line would otherwise raise IndexError.
                    continue
                attack_type = line.strip().split(" ")[3]
                if attack_type == "-":
                    real_samples.append(line)
                elif attack_type in self.allowed_attacks:
                    fake_samples.append(line)
                # Any other attack type is dropped on purpose.

        # Fake rows are split and added before real rows.
        for group in (fake_samples, real_samples):
            for line in self.split_samples(group):
                samples = self.add_line_to_samples(samples, line, subset_dir)

        return pd.DataFrame(samples)

    @staticmethod
    def add_line_to_samples(samples, line, subset_dir):
        """Append the fields of one protocol line to the ``samples`` columns.

        A line holds five space-separated fields: user id, sample name, an
        unused field, attack type and label. The audio file is expected at
        ``subset_dir / "flac" / "<sample_name>.flac"``.

        Args:
            samples: Dict of column lists, mutated in place.
            line: Raw protocol line.
            subset_dir: ``Path`` of the directory containing ``flac``.

        Returns:
            The same ``samples`` dict.

        Raises:
            AssertionError: If the audio file does not exist.
        """
        user_id, sample_name, _, attack_type, label = line.strip().split(" ")
        sample_path = subset_dir / "flac" / f"{sample_name}.flac"

        # Check before appending so a failure cannot leave ragged columns.
        # NOTE: ``assert`` is stripped under ``python -O``; it is kept so the
        # exception type seen by callers stays AssertionError.
        assert sample_path.exists(), f"Missing audio file: {sample_path}"

        samples["user_id"].append(user_id)
        samples["sample_name"].append(sample_name)
        samples["attack_type"].append(attack_type)
        samples["label"].append(label)
        samples["path"].append(sample_path)
        return samples
class ASVSpoof2019DatasetOriginal(ASVSpoofDataset):
    """ASVspoof2019 LA dataset that reads one subset directory per instance."""

    # Maps the fold name given to the constructor onto a directory suffix.
    subsets = {"train": "train", "test": "dev", "val": "eval"}

    protocol_folder_name = "ASVspoof2019_LA_cm_protocols"
    subset_dir_prefix = "ASVspoof2019_LA_"

    # Attack identifiers accepted in each directory's protocol file:
    # A01-A06 for "train" and "dev", A07-A19 for "eval".
    subset_dirs_attacks = {
        "train": [f"A{number:02d}" for number in range(1, 7)],
        "dev": [f"A{number:02d}" for number in range(1, 7)],
        "eval": [f"A{number:02d}" for number in range(7, 20)],
    }

    def __init__(self, path, fold_subset="train"):
        """
        Initialise object. ASVSpoofDataset.__init__ is skipped due to its
        different logic; the constructor of the next class in the MRO is
        called directly instead.

        NOTE(review): ASVSpoofDataset.__init__ passes (subset, transform) to
        that same constructor, while this call passes
        (float('inf'), fold_subset) -- confirm against the base signature.
        """
        super(ASVSpoofDataset, self).__init__(float('inf'), fold_subset)
        self.path = path

        directory_name = self.subsets[fold_subset]
        self.allowed_attacks = self.subset_dirs_attacks[directory_name]
        self.samples = self.read_protocol(
            Path(self.path) / f"{self.subset_dir_prefix}{directory_name}",
            self.get_protocol_path(directory_name),
        )

    def read_protocol(self, subset_dir, protocol_path):
        """Parse a protocol file into a DataFrame, rejecting unknown attacks.

        The fourth space-separated field of each line is the attack type;
        ``"-"`` marks a real sample. No splitting is applied. Rows for fake
        samples come first, followed by rows for real samples.

        Raises:
            ValueError: If a line names an attack outside
                ``self.allowed_attacks``.
        """
        columns = ("user_id", "sample_name", "attack_type", "label", "path")
        table = {column: [] for column in columns}

        bona_fide = []
        spoofed = []
        with open(protocol_path, "r") as protocol:
            for entry in protocol:
                attack = entry.strip().split(" ")[3]
                if attack == "-":
                    bona_fide.append(entry)
                    continue
                if attack not in self.allowed_attacks:
                    raise ValueError(
                        "Tried to load attack that shouldn't be here!"
                    )
                spoofed.append(entry)

        # The whole file is classified before any row is added.
        for entry in spoofed + bona_fide:
            table = self.add_line_to_samples(table, entry, subset_dir)

        return pd.DataFrame(table)