# NOTE: extraction residue (file size, git blob hashes, line-number gutter)
# removed from the top of this file — it was not valid Python.
import gradio as gr
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset
from huggingface_hub import login
from huggingface_hub import InferenceClient
import torch
# Authenticate with Hugging Face Hub.
# NOTE(review): login() with no arguments prompts for a token interactively;
# for non-interactive runs supply a token (e.g. via the HF_TOKEN env var).
login()
# Load Dataset from Kaggle (you can change this to your specific Kaggle dataset)
# Example: Load a dataset related to password classification, or any text classification dataset
# The IMDB sentiment set is a binary-label stand-in; labels are 0/1, matching
# the num_labels=2 head below.
dataset = load_dataset("imdb") # Replace with your own dataset, e.g., Kaggle dataset
# Load Tokenizer and Model: pretrained BERT encoder plus a freshly initialized
# 2-class sequence-classification head.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Batch tokenization step applied to every dataset record via Dataset.map.
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to max length."""
    text_batch = examples['text']
    return tokenizer(text_batch, padding="max_length", truncation=True)
# Apply preprocessing to dataset.
# map(batched=True) feeds batches of examples to preprocess_function and adds
# the tokenizer's output columns (input_ids, attention_mask, ...) to each split.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Split into training and evaluation datasets (IMDB ships "train"/"test" splits).
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log every 10 optimizer steps
    evaluation_strategy="epoch",     # evaluate each epoch
                                     # NOTE(review): this kwarg was renamed to
                                     # `eval_strategy` in newer transformers
                                     # releases — confirm the pinned version
                                     # still accepts `evaluation_strategy`.
    save_strategy="epoch",           # save model each epoch
)
# Initialize Trainer
trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=eval_dataset,    # evaluation dataset
)
# Train the Model (fine-tunes `model` in place; checkpoints land in ./results)
trainer.train()
# Save the Model and Tokenizer to local directories.
model.save_pretrained("./password_sniffer_model")
tokenizer.save_pretrained("./password_sniffer_tokenizer")
# Load the fine-tuned model and tokenizer back from disk, rebinding the
# module-level names that detect_passwords() below reads.
model = BertForSequenceClassification.from_pretrained("./password_sniffer_model")
tokenizer = BertTokenizer.from_pretrained("./password_sniffer_tokenizer")
# Setup Hugging Face Inference Client.
# NOTE(review): InferenceClient expects a Hub repo id / endpoint URL, not a
# local directory, and `client` is never used afterwards — this looks like
# dead or incorrect code; verify intent.
client = InferenceClient("password_sniffer_model")
def detect_passwords(text):
    """
    Detect potential passwords in *text* using the fine-tuned BERT model.

    Uses the module-level ``model`` and ``tokenizer`` loaded above.

    Parameters
    ----------
    text : str
        Raw input text to classify (truncated to 512 tokens).

    Returns
    -------
    str
        Human-readable verdict string.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # FIX: the original ran the forward pass with autograd enabled, building a
    # computation graph that is never used — wasted memory and time at
    # inference. no_grad() disables gradient tracking; outputs are identical.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions, dim=-1).item()
    if predicted_class == 1:  # Assuming '1' represents potential password
        return "Potential password detected."
    return "No password detected."
# Gradio callback: classify the incoming message. The history, system-message,
# and sampling parameters are accepted only for interface compatibility with
# chat-style callers and are deliberately ignored.
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Return the password-detection verdict string for *message*."""
    return detect_passwords(message)
# Gradio Interface.
# BUG FIX: gr.Interface passes input widgets to `fn` positionally. With the
# original `fn=respond`, the first textbox (system message) landed in
# respond()'s `message` parameter and the second textbox (the actual user
# input) landed in `history` — so the model classified the system prompt
# instead of the user's text. This adapter routes each textbox to the correct
# parameter; respond()'s own signature is unchanged.
def _gradio_adapter(system_message, message):
    """Map the two input textboxes onto respond()'s wider signature."""
    return respond(message, None, system_message, None, None, None)

demo = gr.Interface(
    fn=_gradio_adapter,
    inputs=[
        gr.Textbox(value="You are a password detection chatbot.", label="System message"),
        gr.Textbox(value="Hello, your password might be 12345!", label="User input"),
    ],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()