"""Flask service exposing a single /predict endpoint that either triggers the
training pipelines or scores a review as fake/not fake with a pretrained
HeteroGraphormer model."""

import json
from datetime import datetime
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from flask import Flask, request, jsonify
from huggingface_hub import hf_hub_download
from loguru import logger
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from torch_geometric.data import HeteroData

from preprocessing_test import Preprocessor
from src.model import *  # provides HeteroGraphormer and create_temporal_edge_features
from main import start_pipelines

app = Flask(__name__)

# Fallback values used when a field is missing from the request payload or
# fails type validation in /predict.
default_values = {
    'review_id': 'KU_O5udG6zpxOg-VcAEodg',
    'user_id': 'mh_-eMZ6K5RLWhZyISBhwA',
    'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw',
    'review_stars': 0,
    'review_useful': 0,
    'review_funny': 0,
    'review_cool': 0,
    'review_text': 'It was a moderate experience',
    'review_date': 1531001351000,
    'business_name': 'Coffee at LA',
    'address': '1460 LA',
    'city': 'LA',
    'state': 'CA',
    'postal_code': '00000',
    'latitude': 0.0,
    'longitude': 0.0,
    'business_stars': 0.0,
    'business_review_count': 0,
    'is_open': 0,
    'attributes': '{}',
    'categories': 'Restaurants',
    'hours': '{"Monday": "7:0-20:0", "Tuesday": "7:0-20:0", "Wednesday": "7:0-20:0", "Thursday": "7:0-20:0", "Friday": "7:0-21:0", "Saturday": "7:0-21:0", "Sunday": "7:0-21:0"}',
    'user_name': 'default_user',
    'user_review_count': 0,
    'yelping_since': '2023-01-01 00:00:00',
    'user_useful': 0,
    'user_funny': 0,
    'user_cool': 0,
    'elite': '2024,2025',
    'friends': '',
    'fans': 0,
    'average_stars': 0.0,
    'compliment_hot': 0,
    'compliment_more': 0,
    'compliment_profile': 0,
    'compliment_cute': 0,
    'compliment_list': 0,
    'compliment_note': 0,
    'compliment_plain': 0,
    'compliment_cool': 0,
    'compliment_funny': 0,
    'compliment_writer': 0,
    'compliment_photos': 0,
    'checkin_date': '2023-01-01 00:00:00',
    'tip_compliment_count': 0.0,
    'tip_count': 0.0
}
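
# Expected Python type for each field. Values that cannot be coerced to the
# declared type fall back to the corresponding entry in default_values.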
expected_types = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'review_stars': int,
    'review_useful': int,
    'review_funny': int,
    'review_cool': int,
    'review_text': str,
    'review_date': int,
    'business_name': str,
    'address': str,
    'city': str,
    'state': str,
    'postal_code': str,
    'latitude': float,
    'longitude': float,
    'business_stars': float,
    'business_review_count': int,
    'is_open': int,
    'attributes': dict,
    'categories': str,
    'hours': dict,
    'user_name': str,
    'user_review_count': int,
    'yelping_since': str,
    'user_useful': int,
    'user_funny': int,
    'user_cool': int,
    'elite': str,
    'friends': str,
    'fans': int,
    'average_stars': float,
    'compliment_hot': int,
    'compliment_more': int,
    'compliment_profile': int,
    'compliment_cute': int,
    'compliment_list': int,
    'compliment_note': int,
    'compliment_plain': int,
    'compliment_cool': int,
    'compliment_funny': int,
    'compliment_writer': int,
    'compliment_photos': int,
    'checkin_date': str,
    'tip_compliment_count': float,
    'tip_count': float
}
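
# Example request bodies for /predict (illustrative only; omitted fields fall
# back to default_values):
#   {"train": true, "train_size": 0.1}    -> run the training pipelines
#   {"test": true, "review_text": "..."}  -> score a single review with the pretrained model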

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Parse the JSON body; get_json(silent=True) returns None instead of
        # raising when the payload is missing or not valid JSON.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({'error': 'Request must contain JSON data'}), 400

        train = data.get('train', False)
        test = data.get('test', False)
        train_size = float(data.get('train_size', 0.1))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Training branch: run the training pipelines.
        if train in (True, 'true', 'True'):
            start_pipelines(train_size=train_size)
            logger.info("PIPELINES FINISHED SUCCESSFULLY")
            return jsonify({
                'message': 'Training pipelines executed successfully',
                'train_size': train_size
            }), 200

        # Inference branch: score the submitted review with the latest model.
        elif test in (True, 'true', 'True'):
            # Download the latest trained weights from the Hugging Face Hub and
            # restore the model on the selected device.
            REPO_ID = "Askhedi/graphformermodel"
            MODEL_FILENAME = "model_GraphformerModel_latest.pth"
            model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)

            model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(device)
            model.load_state_dict(torch.load(model_path, map_location=device))
            model.eval()
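
            # Coerce every expected field to its declared type, falling back to
            # default_values (and recording a warning) when a value is missing,
            # empty, or cannot be converted.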
            row = {}
            warnings = []
            for col, expected_type in expected_types.items():
                value = data.get(col, default_values[col])
                try:
                    if value == "" or value is None:
                        row[col] = default_values[col]
                    elif col in ['attributes', 'hours']:
                        # Accept either a JSON object or a JSON-encoded string.
                        if isinstance(value, dict):
                            row[col] = value
                        elif isinstance(value, str):
                            parsed = json.loads(value)
                            if not isinstance(parsed, dict):
                                raise ValueError
                            row[col] = value
                        else:
                            raise ValueError
                    else:
                        row[col] = expected_type(value)
                except (ValueError, TypeError, json.JSONDecodeError):
                    row[col] = default_values[col]
                    warnings.append(f"Invalid input for '{col}' (expected {expected_type.__name__}), using default value: {default_values[col]}")

            # Ensure 'attributes' and 'hours' are JSON strings before building the DataFrame.
            for col in ['attributes', 'hours']:
                if isinstance(row[col], dict):
                    row[col] = json.dumps(row[col])

            # Build a one-row DataFrame and run the preprocessing pipeline on it.
            input_df = pd.DataFrame([row])
            preprocessor = Preprocessor(input_df)
            processed_df = preprocessor.run_pipeline()
            logger.info(f"Preprocessing completed, values are {processed_df}")

            # Build a single-example heterogeneous graph: one user, one business, one review.
            num_users = 1
            num_businesses = 1
            num_rows = 1

            graph = HeteroData()
            features = torch.tensor(processed_df.drop(columns=['user_id', 'review_id', 'business_id']).values, dtype=torch.float, device=device)
            time_since_user = torch.tensor(processed_df['time_since_last_review_user'].values, dtype=torch.float, device=device)
            time_since_business = torch.tensor(processed_df['time_since_last_review_business'].values, dtype=torch.float, device=device)

            user_indices = torch.tensor([0], dtype=torch.long, device=device)
            business_indices = torch.tensor([0], dtype=torch.long, device=device)
            review_indices = torch.tensor([0], dtype=torch.long, device=device)

            # Split the flat feature vector into per-node-type blocks
            # (14 user, 8 business, 16 review features).
            user_feats = torch.zeros(num_users, 14, device=device)
            business_feats = torch.zeros(num_businesses, 8, device=device)
            review_feats = torch.zeros(num_rows, 16, device=device)

            user_feats[0] = features[0, :14]
            business_feats[0] = features[0, 14:22]
            review_feats[0] = features[0, 22:38]

            graph['user'].x = user_feats
            graph['business'].x = business_feats
            graph['review'].x = review_feats

            graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
            graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
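
            # Graphormer-style structural encodings on the 3-node graph
            # (0 = user, 1 = business, 2 = review): pairwise shortest-path
            # distances and degree centrality.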
            G = nx.DiGraph()
            node_type_map = {0: 'user', 1: 'business', 2: 'review'}
            G.add_nodes_from([0, 1, 2])
            G.add_edge(0, 2)  # user -> review
            G.add_edge(2, 1)  # review -> business

            num_nodes = 3
            spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=device)
            for i in range(num_nodes):
                for j in range(num_nodes):
                    if i == j:
                        spatial_encoding[i, j] = 0
                    elif nx.has_path(G, i, j):
                        spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)

            centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=device).view(-1, 1)

            # Temporal edge features for each relation type.
            edge_features_dict = {}
            user_writes_edge = graph['user', 'writes', 'review'].edge_index
            review_about_edge = graph['review', 'about', 'business'].edge_index

            edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
                time_since_user[user_writes_edge[0]], time_since_user[user_writes_edge[1]],
                user_indices[user_writes_edge[0]], user_indices[user_writes_edge[0]]
            )
            edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
                time_since_business[review_about_edge[0]], time_since_business[review_about_edge[1]],
                torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
            )

            time_since_dict = {
                'user': torch.tensor([time_since_user[0]], dtype=torch.float, device=device)
            }
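
            # Forward pass; the model output is treated as the probability that
            # the review is fake, thresholded at 0.5.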
            with torch.no_grad():
                out = model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
                prob = out.squeeze().item()
                pred_label = 1 if prob > 0.5 else 0

            result = {
                'warnings': warnings,
                'prediction': 'Fake' if pred_label == 1 else 'Not Fake',
                'probability': float(prob)
            }
            return jsonify(result), 200

        else:
            return jsonify({
                'error': 'Either "train" or "test" must be set to true'
            }), 400

    except Exception as e:
        # Log the full traceback and surface the error message to the client.
        logger.exception("Unhandled error in /predict")
        return jsonify({'error': str(e)}), 500
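

# Local development entry point (a minimal sketch; host and port are
# illustrative defaults, not taken from project configuration).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)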