# src/predict.py
import os # To help build file paths correctly
import torch # PyTorch library, for tensors and model operations
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
) # Hugging Face classes for loading the model and tokenizer

# --- Configuration ---
# This is where our fine-tuned model and tokenizer files are stored
# Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
_SCRIPT_DIR = os.path.dirname(
os.path.abspath(__file__)
) # Gets the directory where this script is
MODEL_PATH = os.path.join(
    _SCRIPT_DIR, "fine_tuned_model"
) # Directory containing the fine-tuned model and tokenizer files
print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}") # For checking the path

# --- Device Setup ---
# Check if a GPU is available, otherwise use the CPU.
# Using a GPU makes predictions much faster.
if torch.cuda.is_available():
device = torch.device("cuda")
# Trying to get the name of the GPU, just for information
try:
gpu_name = torch.cuda.get_device_name(0)
print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
except Exception as e:
print(
f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
)
else:
device = torch.device("cpu")
print(
"INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
)

# --- Load Model and Tokenizer ---
# We load these once when the script (or module) is first loaded.
# This is much better than loading them every time we want to predict.
model = None
tokenizer = None
id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"} # Default mapping
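# Note: config.json stores id2label with string keys (JSON objects cannot have int
# keys), e.g. {"0": "negative", "1": "neutral", "2": "positive"}, which is why the
# keys are converted back to ints after loading below.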
try:
print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
# Load the pre-trained model for sequence classification
# This should be the PyTorch RoBERTa model we fine-tuned
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.to(device) # Move the model to the GPU (or CPU if no GPU)
model.eval() # Set the model to evaluation mode (important for layers like Dropout)
print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")
print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
# Load the tokenizer that matches the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print("INFO (predict.py): Tokenizer loaded successfully.")
# Get the label mapping from the model's configuration
# This was saved during fine-tuning
if hasattr(model.config, "id2label") and model.config.id2label:
id2label_mapping = model.config.id2label
# Convert string keys from config.json to int if necessary
id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
print(
f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
)
else:
print(
"WARN (predict.py): id2label not found in model config, using default mapping."
)
except OSError: # raised by from_pretrained when model/tokenizer files are missing (includes FileNotFoundError)
print(f"--- CRITICAL ERROR (predict.py) ---")
print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
print(
f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
)
# Keep model and tokenizer as None, so predict_sentiments can handle it
except Exception as e:
print(f"--- ERROR (predict.py) ---")
print(f"An unexpected error occurred loading model or tokenizer: {e}")
# Keep model and tokenizer as None


# --- Preprocessing Function ---
# Same function we used on the training data, to make sure inputs are consistent.
def preprocess_tweet(text):
"""Replaces @user mentions and http links with placeholders."""
preprocessed_text = []
if text is None:
return "" # Handle None input
# Split text into parts by space
for t in text.split(" "):
if len(t) > 0: # Avoid processing empty parts from multiple spaces
t = "@user" if t.startswith("@") else t # Replace mentions
t = "http" if t.startswith("http") else t # Replace links
preprocessed_text.append(t)
return " ".join(preprocessed_text) # Put the parts back together
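
# For illustration, the placeholder substitutions above behave like this:
#   preprocess_tweet("@bob check https://example.com now") -> "@user check http now"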


# --- Prediction Function (returns predicted labels with per-class probabilities) ---
def predict_sentiments(comment_list: list):
"""
Predicts sentiments for a list of comment strings.
Returns a list of dictionaries, each containing the predicted label
and the probabilities (scores) for each class.
e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
"""
# Check if model and tokenizer are ready
if model is None or tokenizer is None:
print(
"ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
)
        # Return one error entry per requested comment (a fresh dict for each)
        return [
            {"label": "Error: Model not loaded", "scores": {}} for _ in comment_list
        ]
if not comment_list: # Handle empty input list
return []
inference_batch_size = 64 # You can adjust this number based on performance/memory
print(
f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
)
all_results_list = [] # We'll collect results for all batches here
# --- Loop through the comment list in batches ---
try:
total_comments = len(comment_list)
# This loop goes from 0 to total_comments, jumping by inference_batch_size each time
for i in range(0, total_comments, inference_batch_size):
# Get the current slice of comments for this batch
batch_comments = comment_list[i : i + inference_batch_size]
# Just printing progress for long lists
current_batch_num = i // inference_batch_size + 1
total_batches = (
total_comments + inference_batch_size - 1
) // inference_batch_size
print(
f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
)
# --- Process ONLY the current batch ---
# 1. Preprocess this specific batch
processed_batch = [preprocess_tweet(comment) for comment in batch_comments]
# 2. Tokenize this batch
# Tokenizer handles padding within this smaller batch
inputs = tokenizer(
processed_batch,
padding=True,
truncation=True,
return_tensors="pt",
                # Guard against tokenizers that report a huge sentinel value for
                # model_max_length; RoBERTa-style models accept at most 512 tokens.
                max_length=min(
                    getattr(tokenizer, "model_max_length", 512) or 512, 512
                ),
)
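            # "inputs" is a dict of tensors (typically "input_ids" and "attention_mask"),
            # each shaped [batch_size, padded_sequence_length]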
# 3. Move this batch's inputs to the device (GPU/CPU)
inputs = {k: v.to(device) for k, v in inputs.items()}
# 4. Make prediction for this batch - no need for gradients
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits # Raw scores from the model for this batch
# 5. Calculate probabilities and get predicted class IDs for this batch
probabilities_batch = torch.softmax(logits, dim=-1)
predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)
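            # e.g. logits of shape [batch_size, num_labels] -> softmax gives per-class
            # probabilities summing to 1, and argmax picks the most likely class id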
# 6. Move results back to CPU, convert to lists for easier looping
probs_list_batch = probabilities_batch.cpu().numpy().tolist()
ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()
# 7. Format results for each comment in THIS batch
batch_results = []
for j in range(len(ids_list_batch)):
pred_id = ids_list_batch[j]
pred_label = id2label_mapping.get(
pred_id, "Unknown"
) # Map ID to label name
# Create the scores dictionary for this comment
pred_scores = {
label_name: probs_list_batch[j][label_id]
for label_id, label_name in id2label_mapping.items()
if 0
<= label_id
< probabilities_batch.shape[-1] # Safety check for index
}
# Add the result for this comment
batch_results.append({"label": pred_label, "scores": pred_scores})
# Add the results from this completed batch to our main list
all_results_list.extend(batch_results)
# --- Finished processing current batch ---
print(
f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
)
except Exception as e:
# Catch errors that might happen during the loop
print(f"--- ERROR (predict.py - predict_sentiments loop) ---")
        print(
            f"An error occurred during batch prediction (after {len(all_results_list)} comments were processed successfully): {e}"
        )
import traceback
traceback.print_exc() # Print full error details to console
# Try to return results for processed batches + error messages for the rest
num_processed = len(all_results_list)
num_remaining = len(comment_list) - num_processed
# Add error indicators for comments that couldn't be processed
        all_results_list.extend(
            {"label": "Error: Batch failed", "scores": {}} for _ in range(num_remaining)
        )
# Return the list containing results for all comments
return all_results_list
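
# A minimal usage sketch from another module (the "src.predict" import path is
# assumed from the header comment; adjust it to your project layout):
#   from src.predict import predict_sentiments
#   results = predict_sentiments(["great video!", "this is awful"])
#   # each entry looks like {"label": "...", "scores": {"negative": ..., "neutral": ..., "positive": ...}}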


# --- Main block for testing this script directly ---
if __name__ == "__main__":
print("\n--- Testing predict.py Script Directly ---")
if model and tokenizer:
sample_comments_for_testing = [
"This is an amazing movie, I loved it!",
"I'm not sure how I feel about this, it was okay.",
"Worst experience ever, would not recommend.",
"The food was alright, but the service was slow.",
"What a fantastic day! #blessed",
"I hate waiting in long lines.",
"@user Check out http this is cool.",
"Just a normal sentence, nothing special here.",
"",
"This new update is absolutely terrible and full of bugs.",
]
print("\nInput Comments for Direct Test:")
for i, c in enumerate(sample_comments_for_testing):
print(f"{i+1}. '{c}'")
# Get predictions (now a list of dictionaries)
prediction_results = predict_sentiments(sample_comments_for_testing)
print("\nPredicted Sentiments and Scores (Direct Test):")
# Loop through the results list
for i, (comment, result) in enumerate(
zip(sample_comments_for_testing, prediction_results)
):
print(f"{i+1}. Comment: '{comment}'")
# Format scores nicely for printing
scores_dict = result.get("scores", {})
formatted_scores = ", ".join(
[f"{name}: {score:.3f}" for name, score in scores_dict.items()]
)
print(f" -> Predicted Label: {result.get('label', 'N/A')}")
# Also print the raw scores dictionary
print(f" -> Scores: {{{formatted_scores}}}")
print("--- Direct Test Finished ---")
else:
print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
print(
f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
)