|
|
|
|
|
import os |
|
import torch |
|
from transformers import ( |
|
AutoModelForSequenceClassification, |
|
AutoTokenizer, |
|
) |
|
|
|
|
|
|
|
|
|
|
|
# Resolve the model directory relative to this file (not the current working
# directory) so the module finds it regardless of where it is launched from.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(_SCRIPT_DIR, "fine_tuned_model")

print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}")
|
|
|
|
|
|
|
|
|
# Select the inference device once, at import time: CUDA when PyTorch reports
# it as available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")

    # The GPU name is only used for the log line, so a failure to read it is
    # reported but does not prevent CUDA from being used.
    try:
        gpu_name = torch.cuda.get_device_name(0)
    except Exception as e:
        print(
            f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
        )
    else:
        print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
else:
    device = torch.device("cpu")
    print(
        "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
    )
|
|
|
|
|
|
|
|
|
# Module-level inference state. ``model`` and ``tokenizer`` keep the value
# ``None`` when loading fails; predict_sentiments() checks for that before
# attempting any prediction.
model = None
tokenizer = None
# Default class-id -> label mapping, replaced below when the loaded model's
# config provides its own ``id2label``.
id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}


def _report_missing_model_files():
    """Print the actionable message shown when the model files are absent."""
    print("--- CRITICAL ERROR (predict.py) ---")
    print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
    print(
        f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
    )


if not os.path.isdir(MODEL_PATH):
    # Checked explicitly: for a missing local directory, from_pretrained()
    # typically raises a plain OSError/ValueError rather than
    # FileNotFoundError, so relying on the except clause alone would hide
    # this message behind the generic error below.
    _report_missing_model_files()
else:
    try:
        print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
        model.to(device)
        # Evaluation mode: this module only runs inference.
        model.eval()
        print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")

        print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        print("INFO (predict.py): Tokenizer loaded successfully.")

        if hasattr(model.config, "id2label") and model.config.id2label:
            # Normalise keys to int so lookups by predicted class id succeed
            # even if the config stores them as strings.
            id2label_mapping = {int(k): v for k, v in model.config.id2label.items()}
            print(
                f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
            )
        else:
            print(
                "WARN (predict.py): id2label not found in model config, using default mapping."
            )

    except FileNotFoundError:
        _report_missing_model_files()

    except Exception as e:
        # Deliberately broad: a load failure must not crash importers of this
        # module; they see model/tokenizer left as None instead.
        print("--- ERROR (predict.py) ---")
        print(f"An unexpected error occurred loading model or tokenizer: {e}")
|
|
|
|
|
|
|
|
|
|
|
def preprocess_tweet(text):
    """Replace @user mentions and http links with placeholders.

    The text is split on single spaces. Empty tokens (produced by leading,
    trailing or repeated spaces) are dropped, every token starting with ``@``
    becomes ``"@user"`` and every token starting with ``http`` becomes
    ``"http"``. The surviving tokens are re-joined with single spaces.

    Args:
        text: The raw comment. ``None`` yields an empty string. Any other
            non-string value is converted with ``str()`` instead of raising,
            so one malformed entry cannot abort a whole prediction batch.

    Returns:
        The preprocessed text as a single string.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    preprocessed_tokens = []
    for token in text.split(" "):
        if not token:
            continue
        if token.startswith("@"):
            token = "@user"
        elif token.startswith("http"):
            token = "http"
        preprocessed_tokens.append(token)
    return " ".join(preprocessed_tokens)
|
|
|
|
|
|
|
def predict_sentiments(comment_list: list):
    """
    Predicts sentiments for a list of comment strings.

    Each comment is cleaned with ``preprocess_tweet``, then the comments are
    tokenized and passed through the module-level ``model`` in batches of 64
    with gradients disabled.

    Args:
        comment_list: The comments to classify. ``None`` or an empty list
            returns an empty list.

    Returns:
        A list with one dictionary per input comment, in input order, each
        containing the predicted label and the probabilities (scores) for
        each class.
        e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]

        When the model or tokenizer is not loaded, every entry is
        ``{"label": "Error: Model not loaded", "scores": {}}``. When a batch
        raises, the entries for that batch are
        ``{"label": "Error: Batch failed", "scores": {}}`` and the remaining
        batches are still processed. Every entry is its own dict object.
    """
    import traceback

    # Handle "nothing to do" first so len() is never called on None.
    if not comment_list:
        return []

    if model is None or tokenizer is None:
        print(
            "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
        )
        # One fresh dict per comment: ``[d] * n`` would alias a single dict,
        # so a caller mutating one result would change all of them.
        return [
            {"label": "Error: Model not loaded", "scores": {}} for _ in comment_list
        ]

    inference_batch_size = 64
    total_comments = len(comment_list)
    total_batches = (total_comments + inference_batch_size - 1) // inference_batch_size
    print(
        f"INFO (predict.py): Predicting sentiments for {total_comments} comments in batches of {inference_batch_size}..."
    )

    # A tokenizer saved without a length limit reports a huge sentinel
    # (int(1e30)) as model_max_length. Truncating to that is a no-op, so
    # over-long comments would reach the model untruncated. Treat missing,
    # zero or implausibly large values as "unset" and fall back to 512.
    # The 100_000 cut-off is only a sanity bound, not a model property.
    max_length = getattr(tokenizer, "model_max_length", None)
    if not max_length or max_length > 100_000:
        max_length = 512

    all_results_list = []
    failed_batches = 0

    for current_batch_num, start in enumerate(
        range(0, total_comments, inference_batch_size), start=1
    ):
        batch_comments = comment_list[start : start + inference_batch_size]
        print(
            f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
        )

        try:
            processed_batch = [preprocess_tweet(comment) for comment in batch_comments]

            inputs = tokenizer(
                processed_batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=max_length,
            )
            # Inputs must live on the same device as the model.
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits

            probabilities_batch = torch.softmax(logits, dim=-1)
            num_classes = probabilities_batch.shape[-1]

            probs_list_batch = probabilities_batch.cpu().tolist()
            ids_list_batch = torch.argmax(probabilities_batch, dim=-1).cpu().tolist()

            batch_results = [
                {
                    "label": id2label_mapping.get(pred_id, "Unknown"),
                    # Skip mapping entries whose id has no output column.
                    "scores": {
                        label_name: probs[label_id]
                        for label_id, label_name in id2label_mapping.items()
                        if 0 <= label_id < num_classes
                    },
                }
                for pred_id, probs in zip(ids_list_batch, probs_list_batch)
            ]

        except Exception as e:
            # Best effort: report the failure and keep output aligned with
            # the input by emitting one placeholder per comment of this
            # batch, then carry on with the next batch.
            failed_batches += 1
            print("--- ERROR (predict.py - predict_sentiments loop) ---")
            print(
                f"An error occurred during batch prediction (around comment index {start}): {e}"
            )
            traceback.print_exc()
            batch_results = [
                {"label": "Error: Batch failed", "scores": {}} for _ in batch_comments
            ]

        all_results_list.extend(batch_results)

    if failed_batches:
        print(
            f"WARN (predict.py): {failed_batches} of {total_batches} batches failed during prediction."
        )
    else:
        print(
            f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
        )

    return all_results_list
|
|
|
|
|
|
|
# Manual smoke test: runs only when this file is executed directly.
if __name__ == "__main__":
    print("\n--- Testing predict.py Script Directly ---")

    # Compare against None, as predict_sentiments() does, instead of relying
    # on the truthiness of the model/tokenizer objects.
    if model is None or tokenizer is None:
        print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
        print(
            f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
        )
    else:
        sample_comments_for_testing = [
            "This is an amazing movie, I loved it!",
            "I'm not sure how I feel about this, it was okay.",
            "Worst experience ever, would not recommend.",
            "The food was alright, but the service was slow.",
            "What a fantastic day! #blessed",
            "I hate waiting in long lines.",
            "@user Check out http this is cool.",
            "Just a normal sentence, nothing special here.",
            "",
            "This new update is absolutely terrible and full of bugs.",
        ]

        print("\nInput Comments for Direct Test:")
        for number, comment in enumerate(sample_comments_for_testing, start=1):
            print(f"{number}. '{comment}'")

        prediction_results = predict_sentiments(sample_comments_for_testing)

        print("\nPredicted Sentiments and Scores (Direct Test):")
        for number, (comment, result) in enumerate(
            zip(sample_comments_for_testing, prediction_results), start=1
        ):
            print(f"{number}. Comment: '{comment}'")

            scores_dict = result.get("scores", {})
            formatted_scores = ", ".join(
                f"{name}: {score:.3f}" for name, score in scores_dict.items()
            )
            print(f" -> Predicted Label: {result.get('label', 'N/A')}")
            print(f" -> Scores: {{{formatted_scores}}}")

        print("--- Direct Test Finished ---")
|
|