import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import json
from pathlib import Path
class BankingModelTrainer:
    def __init__(
        self,
        base_model_name="meta-llama/Llama-2-13b-chat-hf",
        output_dir="./fine_tuned_model",
        max_length=512
    ):
        self.base_model_name = base_model_name
        self.output_dir = Path(output_dir)
        self.max_length = max_length
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Llama-2 loading options: shard across available devices, cap GPU 0
        # at 10 GB, and quantize the weights to 8 bits to fit in memory
        model_config = {
            "device_map": "auto",
            "torch_dtype": torch.bfloat16,
            "low_cpu_mem_usage": True,
            "max_memory": {0: "10GB"},
            "load_in_8bit": True
        }

        # Initialize the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        # Llama-2's tokenizer ships without a pad token; padding=True in
        # prepare_data would fail without one
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            **model_config
        )
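
    # --- Optional: parameter-efficient fine-tuning -----------------------
    # Trainer cannot do full fine-tuning on weights loaded with
    # load_in_8bit=True; the usual workaround is to train LoRA adapters on
    # top of the frozen 8-bit base. The method below is an illustrative
    # sketch, not part of the original script, and assumes the `peft`
    # package is installed.
    def apply_lora(self):
        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

        # Enable gradient flow through the quantized base model
        self.model = prepare_model_for_kbit_training(self.model)
        lora_config = LoraConfig(
            r=16,                    # rank of the low-rank update matrices
            lora_alpha=32,           # scaling applied to the LoRA updates
            target_modules=["q_proj", "v_proj"],  # Llama attention projections
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(self.model, lora_config)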

    def prepare_data(self, data_path):
        # Read the data from file
        if data_path.endswith('.csv'):
            df = pd.read_csv(data_path)
        elif data_path.endswith('.json'):
            with open(data_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            df = pd.DataFrame(data)
        else:
            raise ValueError("File format must be CSV or JSON")

        # Process and prepare the data
        def prepare_examples(examples):
            conversations = []
            for q, a in zip(examples['question'], examples['answer']):
                # Llama-2 chat format: question inside [INST] tags, answer after
                conv = f"[INST] {q} [/INST] {a}"
                conversations.append(conv)
            # Tokenize for Llama-2; return plain lists so datasets.map can
            # store the columns (the data collator builds tensors later)
            encodings = self.tokenizer(
                conversations,
                truncation=True,
                padding=True,
                max_length=self.max_length
            )
            return encodings

        dataset = Dataset.from_pandas(df)
        tokenized_dataset = dataset.map(
            prepare_examples,
            batched=True,
            remove_columns=dataset.column_names
        )
        return tokenized_dataset
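
    # Expected input schema for prepare_data (illustrative sample; the real
    # banking_qa.json used below is not shown in this file):
    #
    #   [
    #     {"question": "...", "answer": "..."},
    #     {"question": "...", "answer": "..."}
    #   ]
    #
    # A CSV file must likewise provide "question" and "answer" columns.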

    def train(self, dataset, epochs=3, batch_size=4):
        training_args = TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=4,
            save_steps=500,
            logging_steps=100,
            learning_rate=2e-5,  # conservative learning rate for Llama-2
            warmup_steps=100,
            bf16=True,  # match the bfloat16 weights loaded in __init__
            save_total_limit=2,
            logging_dir=str(self.output_dir / "logs"),
            gradient_checkpointing=True  # trade compute for memory
        )

        # Causal-LM collator: builds labels from input_ids (mlm=False)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator
        )

        trainer.train()
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)
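
    # To resume an interrupted run from the latest checkpoint saved in
    # output_dir, Trainer accepts (standard transformers API):
    #
    #   trainer.train(resume_from_checkpoint=True)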

    def generate_response(self, prompt):
        # Llama-2 prompt format
        formatted_prompt = f"[INST] {prompt} [/INST]"
        inputs = self.tokenizer.encode(
            formatted_prompt,
            return_tensors="pt"
        ).to(self.device)

        outputs = self.model.generate(
            inputs,
            max_length=self.max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            repetition_penalty=1.2  # discourage repeated phrases
        )

        # Decode only the newly generated tokens so the prompt does not leak
        # into the response (string replacement is unreliable because
        # decoding can normalize whitespace and special tokens)
        response = self.tokenizer.decode(
            outputs[0][inputs.shape[-1]:],
            skip_special_tokens=True
        )
        return response.strip()

if __name__ == "__main__":
    trainer = BankingModelTrainer()
    dataset = trainer.prepare_data("banking_qa.json")
    trainer.train(dataset)
    # Persian prompt: "What are the conditions for a housing loan?"
    response = trainer.generate_response("شرایط وام مسکن چیست؟")
    print(response)
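
# Reloading the fine-tuned model later for inference (illustrative; assumes
# the default output_dir above). Both calls are standard transformers APIs:
#
#   tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
#   model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
#
# If LoRA adapters were used instead (see apply_lora above), load the base
# model first and attach the adapter with peft's PeftModel.from_pretrained.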