import torch
import torch.nn as nn
import torch.optim as optim

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model architecture: one hidden layer with ReLU, followed by
# a sigmoid output for binary classification
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.hidden(x)
        relu = self.relu(hidden)
        output = self.output(relu)
        output = self.sigmoid(output)
        return output

# Define the hyperparameters
learning_rate = 0.01
num_epochs = 1000

# Create an instance of the model
model = Net(input_size=2, hidden_size=5, output_size=1)

# Move the model to the selected device
model.to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Load the training data (the XOR truth table)
X_train = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]]).float().to(device)
y_train = torch.tensor([[0], [1], [1], [0]]).float().to(device)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss at every 100th epoch
    if (epoch + 1) % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

# Save the trained model (avoid spaces in the filename)
torch.save(model.state_dict(), 'trained_model.pt')
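
# A minimal sketch of how the saved weights could be reloaded and evaluated
# on the XOR inputs. It assumes the Net class, device, X_train, and y_train
# defined above are in scope; 'trained_model.pt' matches the torch.save call.
eval_model = Net(input_size=2, hidden_size=5, output_size=1)
eval_model.load_state_dict(torch.load('trained_model.pt', map_location=device))
eval_model.to(device)
eval_model.eval()  # disable training-specific behavior

with torch.no_grad():
    probs = eval_model(X_train)    # sigmoid outputs in [0, 1]
    preds = (probs > 0.5).float()  # threshold to hard 0/1 labels
    print('Predictions:', preds.squeeze().tolist())
    print('Targets:    ', y_train.squeeze().tolist())

# Note: with plain SGD at lr=0.01 for 1000 epochs, the network may not fully
# separate XOR on every run; depending on the random initialization, more
# epochs or a higher learning rate may be needed for the loss to drop.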