"""Fine-tune an extractive question-answering model on the FHeta module list.

The script downloads ``modules.json`` from the FHeta repository, builds a
one-example training set from it, fine-tunes ``distilbert-base-uncased`` with
an extractive QA head, saves the result to ``./FHeta``, reloads it and prints
the answer for a sample query.
"""

import requests
import torch
from datasets import Dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

MODULES_URL = "https://raw.githubusercontent.com/Fixyres/FHeta/refs/heads/main/modules.json"

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from training on an HTTP error page.
pisyn = requests.get(MODULES_URL, timeout=30)
pisyn.raise_for_status()

data = [
    {
        "question": "Какая твоя база данных модулей? И по какой базе ты ищешь все модули?",
        "answer": pisyn.text,
    }
]

# NOTE(review): distilbert-base-uncased is an English checkpoint while the
# question above is Russian; a multilingual checkpoint is probably a better
# fit -- confirm before relying on the results.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# Longest (question + context) sequence the model's position embeddings allow.
MAX_LENGTH = model.config.max_position_embeddings

# Dataset.from_dict expects a mapping of column name -> list of values, so the
# list of row dicts is pivoted into columns first.
dataset = Dataset.from_dict(
    {
        "question": [row["question"] for row in data],
        "answer": [row["answer"] for row in data],
    }
)


def preprocess_function(examples):
    """Tokenize a batch of examples into extractive-QA training features.

    Each ``answer`` text is used as the context the model reads, and the
    target span is the whole part of that context that survives truncation.

    Args:
        examples: Batch mapping with ``"question"`` and ``"answer"`` columns,
            each a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added: one token index per example.
    """
    questions = examples["question"]
    contexts = examples["answer"]

    # Only the context (second sequence) is truncated so the question is never
    # cut; fixed-length padding lets the default collator stack the examples.
    inputs = tokenizer(
        questions,
        contexts,
        max_length=MAX_LENGTH,
        truncation="only_second",
        padding="max_length",
    )

    start_positions = []
    end_positions = []
    for batch_index in range(len(inputs["input_ids"])):
        # sequence_ids marks question tokens with 0, context tokens with 1 and
        # special/padding tokens with None.
        sequence_ids = inputs.sequence_ids(batch_index)
        context_token_indices = [
            token_index
            for token_index, sequence_id in enumerate(sequence_ids)
            if sequence_id == 1
        ]
        if context_token_indices:
            start_positions.append(context_token_indices[0])
            end_positions.append(context_token_indices[-1])
        else:
            # Empty context: point both ends at the first token.
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_datasets = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()

model.save_pretrained("./FHeta")
tokenizer.save_pretrained("./FHeta")

# Reload from disk so inference uses exactly what was saved.
tokenizer = AutoTokenizer.from_pretrained("./FHeta")
model = AutoModelForQuestionAnswering.from_pretrained("./FHeta")
model.eval()


def get_answer(query):
    """Return the span of the module list the model selects for ``query``.

    Args:
        query: Question text to ask against the downloaded module list.

    Returns:
        The decoded text between the highest-scoring start and end tokens, or
        an empty string when the predicted end precedes the predicted start.
    """
    inputs = tokenizer(
        query,
        pisyn.text,
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_tensors="pt",
    )

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    start_index = int(outputs.start_logits.argmax(dim=-1)[0])
    end_index = int(outputs.end_logits.argmax(dim=-1)[0])
    if end_index < start_index:
        return ""

    span_token_ids = inputs["input_ids"][0][start_index : end_index + 1]
    answer = tokenizer.decode(span_token_ids, skip_special_tokens=True)
    return answer


query = "Модуль FHeta"
answer = get_answer(query)
print(answer)