The following example fine-tunes the `zeroMN/zeroSG` sequence-classification model on a mix of Chinese and multilingual datasets:

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

# Datasets to load
datasets = [
    "Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset",
    "clapAI/MultiLingualSentiment",
    "shareAI/ShareGPT-Chinese-English-90k",
    "wikimedia/wikipedia",
    "google/code_x_glue_tt_text_to_text",
    "silk-road/ChatHaruhi-54K-Role-Playing-Dialogue",
    "yentinglin/TaiwanChat",
    "liswei/rm-static-zhTW",
    "yys/OpenOrca-Chinese",
    "Fumika/Wikinews-multilingual",
    "aqweteddy/Taiwan-Curlture-MCQ",
    "Nexdata/Chinese_Mandarin_Multi-emotional_Synthesis_Corpus",
    "Nexdata/Chinese_Mandarin_Entertainment_anchor_Style_Multi-emotional_Synthesis_Corpus",
    "voices365/102_Hours_High_Quality_Chinese_Audio_Dataset_For_Speech_Synthesis_Female_Samples",
    "voices365/Chinese_Female_001VoiceArtist_40Hours_High_Quality_Voice_Dataset",
    "Nexdata/Mandarin_Spontaneous_Speech_Data",
    "speechbrain/common_language",
    "hello2mao/Chinese_Audio_Resource"
]
```
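Note that this list mixes text, dialogue, and audio corpora, and not every entry exposes a `text` column (several are speech-synthesis datasets). As an optional sketch, you could pre-filter to the datasets that actually carry a `text` field before building the combined dataset; the `text_datasets` name is illustrative, and the main script below keeps the full list as written:

```python
# Illustrative pre-filter: keep only dataset names whose train split has a "text" column.
text_datasets = []
for name in datasets:
    try:
        ds = load_dataset(name, split="train")
    except Exception as err:  # some corpora are gated or require an explicit config
        print(f"Skipping {name}: {err}")
        continue
    if "text" in ds.column_names:
        text_datasets.append(name)
```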
```python
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("zeroMN/zeroSG")
tokenizer = AutoTokenizer.from_pretrained("zeroMN/zeroSG")

# Build the combined dataset and data loader
class MyDataset(Dataset):
    def __init__(self, dataset_names):
        self.data = []
        for name in dataset_names:
            data = load_dataset(name)
            self.data.extend(data["train"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assumes each example has a "text" field; adjust per dataset schema.
        text = self.data[idx]["text"]
        inputs = tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,        # clip long sequences to max_length
            padding="max_length",   # pad so the default collate can stack batches
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(0),  # placeholder label; replace with real labels
        }

dataset = MyDataset(datasets)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
```
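Before training, it can help to sanity-check one batch; with `padding="max_length"` every full batch should stack to `[batch_size, 512]` (the check below is purely illustrative):

```python
# Pull a single batch and confirm the tensor shapes line up.
batch = next(iter(data_loader))
print(batch["input_ids"].shape)       # torch.Size([32, 512]) for a full batch
print(batch["attention_mask"].shape)  # torch.Size([32, 512])
print(batch["labels"].shape)          # torch.Size([32])
```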
```python
# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # When labels are passed, the model computes cross-entropy internally,
        # so no separate criterion is needed.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}")
```
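Once training finishes, the fine-tuned weights and tokenizer can be persisted with the standard `save_pretrained` API (the output directory name here is illustrative):

```python
# Save the fine-tuned model and tokenizer; the path is an example, not fixed.
model.save_pretrained("./zeroSG-finetuned")
tokenizer.save_pretrained("./zeroSG-finetuned")
```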