# Hugging Face Space: ojs595/gen_predict (status: Sleeping; file size: 5,759 bytes)

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr
import pandas as pd
import io
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

# Load the pretrained checkpoint and its tokenizer.
# Both objects are module-level globals shared by train_model() and
# predict_gender().
MODEL_NAME = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels=3 sizes the classification head for the three label ids
# (0, 1, 2) that train_model() assigns and predict_gender() decodes.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)  # changed to 3 classes

# Dataset class definition
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row is read positionally and must provide a ``description``
    column (converted to ``str`` before tokenizing) and a ``label`` column
    holding an integer class id.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        """Keep references to the data, the tokenizer and the pad length.

        Args:
            dataframe: pandas DataFrame with ``description`` and ``label``
                columns.
            tokenizer: Callable tokenizer; it is invoked as
                ``tokenizer(text, ...)`` and must return a mapping with
                ``input_ids`` and ``attention_mask`` tensors.
            max_len: Length every sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            A dict with 1-D ``input_ids`` and ``attention_mask`` tensors and
            a scalar ``labels`` tensor of dtype ``torch.long``.

        Raises:
            ValueError: If the row's label cannot be converted to ``int``
                (for example NaN).
        """
        row = self.data.iloc[index]

        # Call the tokenizer directly: ``encode_plus`` is the deprecated
        # spelling of the same single-text encoding.
        encoding = self.tokenizer(
            str(row['description']),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # int() makes a missing/NaN label fail loudly instead of being
            # cast to an arbitrary integer.
            'labels': torch.tensor(int(row['label']), dtype=torch.long),
        }

# Prepare the training data and fine-tune the model
def train_model(epochs=3, lr=5e-5, batch_size=2):
    """Fine-tune the module-level ``model`` on the embedded CSV samples.

    The CSV is parsed, its ``gender`` column is converted to integer labels
    (남자 -> 0, 여자 -> 1, anything else -> 2), 80% of the rows are kept for
    training and the model is optimized in place with AdamW. The model is
    moved to CUDA when available and is left in eval mode on return.

    Args:
        epochs: Number of passes over the training rows.
        lr: AdamW learning rate.
        batch_size: Number of rows per optimization step.
    """
    csv_data = """description,gender
"그는 축구를 정말 좋아하고, 근육질의 몸매를 가졌다.",남자
"그녀는 긴 머리를 가졌고, 분홍색 원피스를 입었다.",여자
"짧은 머리에 정장을 입은 그는 회의에 참석했다.",남자
"아름다운 목소리로 노래하는 그녀는 가수다.",여자
"그의 취미는 자동차 정비와 컴퓨터 게임이다.",남자
"그녀는 섬세한 손길로 아기 인형을 만들었다.",여자
"군대에서 막 제대한 그는 씩씩해 보였다.",남자
"그녀는 친구들과 수다 떠는 것을 좋아한다.",여자
"강력한 리더십으로 팀을 이끄는 모습이 인상적이었다.",남자
"자신이 직접 만든 쿠키를 주변에 나누어주곤 한다.",여자
"정민지",천사
"""

    data = pd.read_csv(io.StringIO(csv_data))
    # Three-class labels: 남자=0, 여자=1, every other value (천사) = 2.
    label_ids = {'남자': 0, '여자': 1}
    data['label'] = data['gender'].map(lambda gender: label_ids.get(gender, 2))
    # NOTE(review): the held-out 20% is discarded and never evaluated, so on
    # this tiny sample the split only reduces the training data — confirm
    # that this is intended.
    train_data, _ = train_test_split(data, test_size=0.2, random_state=42)

    # Shuffle so each epoch sees the rows in a different order instead of
    # always replaying the same fixed batches.
    train_loader = DataLoader(
        CustomDataset(train_data, tokenizer),
        batch_size=batch_size,
        shuffle=True,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    print("모델 훈련 시작...")
    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            # Passing ``labels`` makes the model compute the loss itself.
            outputs.loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} 완료")

    # Leave the shared model in inference mode so later callers do not run
    # it with dropout enabled by accident.
    model.eval()
    print("모델 훈련 완료!")

# Prediction function
def predict_gender(text):
    """Classify ``text`` with the module-level model and format the result.

    Args:
        text: Description to classify. ``None``, empty and whitespace-only
            values are treated as missing input; other non-string values
            are converted with ``str``.

    Returns:
        A Korean message: either a prompt asking for input, or the
        predicted label (남자/여자/천사) with its softmax probability.
    """
    # Guard first so missing input never reaches the tokenizer or the model.
    if text is None or not str(text).strip():
        return "텍스트를 입력해주세요."
    text = str(text)

    # Use the device the model actually lives on; deriving it from CUDA
    # availability breaks when the model has not been moved there.
    device = next(model.parameters()).device
    model.eval()

    # Call the tokenizer directly: ``encode_plus`` is the deprecated
    # spelling of the same single-text encoding.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        prediction = int(torch.argmax(logits, dim=1).flatten().item())
        confidence = probabilities[0][prediction].item()

    # Three-class mapping: 0=남자, 1=여자, 2=천사
    gender_map = {0: "남자", 1: "여자", 2: "천사"}
    gender = gender_map[prediction]

    return f"예측 성별: {gender} (신뢰도: {confidence:.2%})"

# Train the model when the app starts.
# NOTE: this runs at module level, so the fine-tuning happens on every
# import of this file, before the interface below is constructed.
print("앱 초기화 중...")
train_model()

# Build the Gradio interface: one text box in, one text box out, wired to
# predict_gender().
iface = gr.Interface(
    fn=predict_gender,
    inputs=gr.Textbox(
        lines=3, 
        placeholder="성별을 예측할 텍스트를 입력하세요.\n예: '그는 축구를 좋아하고 근육질이다.'",
        label="텍스트 입력"
    ),
    outputs=gr.Textbox(label="예측 결과"),
    title="🤖 AI 성별 예측기 (3분류)",
    description="입력된 텍스트를 바탕으로 성별을 예측합니다. (남자/여자/천사)",
    # The first four examples are sentences from the training CSV in
    # train_model(); the last two do not appear there.
    examples=[
        ["그는 축구를 정말 좋아하고, 근육질의 몸매를 가졌다."],
        ["그녀는 긴 머리를 가졌고, 분홍색 원피스를 입었다."],
        ["짧은 머리에 정장을 입은 그는 회의에 참석했다."],
        ["아름다운 목소리로 노래하는 그녀는 가수다."],
        ["그들은 책 읽기를 좋아하고 조용한 성격이다."],
        ["요리와 청소를 모두 잘하며 집안일을 도맡아 한다."]
    ],
    theme=gr.themes.Soft()
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()