Initial setup with Longformer embedding feature
Browse files- .huggingface-space +8 -0
- Dockerfile +16 -0
- README.md +117 -6
- app.py +283 -0
- best_model.pth +3 -0
- network_test.py +125 -0
- requirements.txt +18 -0
- src/ProcessOneSingleCampaign.py +296 -0
- src/__pycache__/ProcessOneSingleCampaign.cpython-311.pyc +0 -0
- src/__pycache__/explainer.cpython-310.pyc +0 -0
- src/__pycache__/explainer.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-310.pyc +0 -0
- src/__pycache__/model.cpython-311.pyc +0 -0
- src/explainer.py +108 -0
- src/model.py +148 -0
- test.py +183 -0
.huggingface-space
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title: Kickstarter Success Prediction
|
2 |
+
emoji: π
|
3 |
+
colorFrom: blue
|
4 |
+
colorTo: green
|
5 |
+
sdk: docker
|
6 |
+
sdk_version: "3.9"
|
7 |
+
app_file: app.py
|
8 |
+
pinned: false
|
Dockerfile
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9-slim
|
2 |
+
|
3 |
+
WORKDIR /app
|
4 |
+
|
5 |
+
COPY . /app/
|
6 |
+
|
7 |
+
RUN apt-get update && \
|
8 |
+
apt-get install -y --no-install-recommends \
|
9 |
+
build-essential \
|
10 |
+
&& rm -rf /var/lib/apt/lists/*
|
11 |
+
|
12 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
13 |
+
|
14 |
+
EXPOSE 7860
|
15 |
+
|
16 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,12 +1,123 @@
|
|
1 |
---
|
2 |
-
title: Kickstarter Prediction
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
|
|
|
|
7 |
pinned: false
|
8 |
-
license: mit
|
9 |
-
short_description: Prediction returned with Description Embedding
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Kickstarter Success Prediction
|
3 |
+
emoji: π
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: green
|
6 |
sdk: docker
|
7 |
+
sdk_version: "3.9"
|
8 |
+
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
# Kickstarter Success Prediction API
|
15 |
+
|
16 |
+
This API predicts the success probability of Kickstarter campaigns using machine learning.
|
17 |
+
|
18 |
+
## API Usage
|
19 |
+
|
20 |
+
### Endpoint: `/predict`
|
21 |
+
|
22 |
+
Send a POST request with your campaign data in JSON format.
|
23 |
+
|
24 |
+
### Input Format
|
25 |
+
|
26 |
+
```json
|
27 |
+
{
|
28 |
+
"raw_description": "Detailed project description...",
|
29 |
+
"raw_blurb": "Short project summary...",
|
30 |
+
"raw_risks": "Project risks and challenges...",
|
31 |
+
"raw_category": "Technology",
|
32 |
+
"raw_subcategory": "Gadgets",
|
33 |
+
"raw_country": "Canada",
|
34 |
+
"description_length": 557,
|
35 |
+
"funding_goal": 58000,
|
36 |
+
"image_count": 8,
|
37 |
+
"video_count": 3,
|
38 |
+
"campaign_duration": 90,
|
39 |
+
"previous_projects_count": 5,
|
40 |
+
"previous_success_rate": 0.4,
|
41 |
+
"previous_pledged": 18745.33,
|
42 |
+
"previous_funding_goal": 23564.99
|
43 |
+
}
|
44 |
+
```
|
45 |
+
|
46 |
+
The system will use the provided numerical values directly if they exist in the input. If any numerical fields are missing, they will be calculated during preprocessing.
|
47 |
+
|
48 |
+
### Output Format
|
49 |
+
|
50 |
+
```json
|
51 |
+
{
|
52 |
+
"success_probability": 0.7532,
|
53 |
+
"predicted_outcome": "Success",
|
54 |
+
"shap_values": {
|
55 |
+
"funding_goal": -0.8991450071334839,
|
56 |
+
"description_embedding": -0.04273056983947754,
|
57 |
+
"subcategory_embedding": 0.011444330215454102,
|
58 |
+
"previous_funding_goal": -0.008600413799285889,
|
59 |
+
"video_count": 0.0037734508514404297,
|
60 |
+
...
|
61 |
+
},
|
62 |
+
"longformer_embedding": [0.0213, -0.0124, 0.0342, ..., 0.0547]
|
63 |
+
}
|
64 |
+
```
|
65 |
+
|
66 |
+
- `success_probability`: A value between 0 and 1 representing the likelihood of project success
|
67 |
+
- `predicted_outcome`: "Success" if probability ≥ 0.5, otherwise "Failure"
|
68 |
+
- `shap_values`: Contribution of each feature to the prediction (positive values increase success probability, negative values decrease it)
|
69 |
+
- `longformer_embedding`: The 768-dimensional vector generated by the Longformer model representing the semantic content of the project description (useful for further analysis or clustering)
|
70 |
+
|
71 |
+
## Example Usage with Python
|
72 |
+
|
73 |
+
```python
|
74 |
+
import requests
|
75 |
+
import json
|
76 |
+
import numpy as np
|
77 |
+
|
78 |
+
# API endpoint
|
79 |
+
api_url = "https://huggingface.co/spaces/angusfung/kickstarter-success-prediction/predict"
|
80 |
+
|
81 |
+
# Load your campaign data
|
82 |
+
campaign_data = {
|
83 |
+
"raw_description": "Introducing the AquaGo...",
|
84 |
+
"raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier...",
|
85 |
+
"raw_risks": "Bringing a product to market involves...",
|
86 |
+
"raw_subcategory": "Gadgets",
|
87 |
+
"raw_category": "Technology",
|
88 |
+
"raw_country": "Canada",
|
89 |
+
"funding_goal": 2000,
|
90 |
+
"image_count": 8,
|
91 |
+
"video_count": 3
|
92 |
+
}
|
93 |
+
|
94 |
+
# Make prediction request
|
95 |
+
response = requests.post(api_url, json=campaign_data)
|
96 |
+
|
97 |
+
# Print results
|
98 |
+
if response.status_code == 200:
|
99 |
+
result = response.json()
|
100 |
+
print(f"Success Probability: {result['success_probability']:.2f}")
|
101 |
+
print(f"Predicted Outcome: {result['predicted_outcome']}")
|
102 |
+
print("\nTop 5 SHAP Values (Feature Importance):")
|
103 |
+
for i, (feature, value) in enumerate(list(result['shap_values'].items())[:5]):
|
104 |
+
print(f"{feature}: {value:.4f}")
|
105 |
+
|
106 |
+
# Access the longformer embedding if needed
|
107 |
+
if 'longformer_embedding' in result:
|
108 |
+
embedding = np.array(result['longformer_embedding'])
|
109 |
+
print(f"\nLongformer Embedding Shape: {embedding.shape}")
|
110 |
+
else:
|
111 |
+
print(f"Error: {response.status_code}")
|
112 |
+
print(response.text)
|
113 |
+
```
|
114 |
+
|
115 |
+
## Example Usage with cURL
|
116 |
+
|
117 |
+
```bash
|
118 |
+
curl -X POST "https://huggingface.co/spaces/angusfung/kickstarter-success-prediction/predict" \
|
119 |
+
-H "Content-Type: application/json" \
|
120 |
+
-d @campaign.json
|
121 |
+
```
|
122 |
+
|
123 |
+
Where `campaign.json` contains your campaign data in the format described above.
|
app.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
import logging
|
6 |
+
from pathlib import Path
|
7 |
+
from contextlib import asynccontextmanager
|
8 |
+
from fastapi import FastAPI, HTTPException, Request, Response
|
9 |
+
from fastapi.responses import JSONResponse
|
10 |
+
from fastapi.middleware.cors import CORSMiddleware
|
11 |
+
|
12 |
+
from src.model import KickstarterModel
|
13 |
+
from src.explainer import KickstarterExplainer
|
14 |
+
from src.ProcessOneSingleCampaign import CampaignProcessor
|
15 |
+
|
16 |
+
# Configure logging
|
17 |
+
logging.basicConfig(
|
18 |
+
level=logging.INFO,
|
19 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
20 |
+
handlers=[logging.StreamHandler()]
|
21 |
+
)
|
22 |
+
logger = logging.getLogger(__name__)
|
23 |
+
|
24 |
+
# Allow numpy.core.multiarray.scalar to be loaded safely
|
25 |
+
try:
|
26 |
+
import numpy.core.multiarray
|
27 |
+
torch.serialization.add_safe_globals([numpy.core.multiarray.scalar])
|
28 |
+
logger.info("Added numpy.core.multiarray.scalar to safe globals")
|
29 |
+
except Exception as e:
|
30 |
+
logger.warning(f"Failed to add safe globals: {str(e)}")
|
31 |
+
|
32 |
+
# Constants
|
33 |
+
NUMERICAL_FIELDS = [
|
34 |
+
'description_length', 'funding_goal', 'image_count', 'video_count',
|
35 |
+
'campaign_duration', 'previous_projects_count', 'previous_success_rate',
|
36 |
+
'previous_pledged', 'previous_funding_goal'
|
37 |
+
]
|
38 |
+
|
39 |
+
EMBEDDING_NAMES = [
|
40 |
+
'description_embedding', 'blurb_embedding', 'risk_embedding',
|
41 |
+
'subcategory_embedding', 'category_embedding', 'country_embedding'
|
42 |
+
]
|
43 |
+
|
44 |
+
# Global variables to store the model and processor
|
45 |
+
model = None
|
46 |
+
explainer = None
|
47 |
+
processor = None
|
48 |
+
device = None
|
49 |
+
|
50 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialise the global model, processor and explainer for the app's lifetime.

    On startup this selects the torch device, creates the /tmp model cache,
    builds a lazily-loading CampaignProcessor, instantiates KickstarterModel,
    tries to load weights from ``best_model.pth`` and wraps the model in a
    KickstarterExplainer. Control is then yielded to FastAPI; on shutdown only
    a log line is emitted.

    A missing or unloadable checkpoint is tolerated (the service continues
    with freshly initialised weights), but the model is always moved to the
    selected device and switched to evaluation mode so inference behaves
    consistently in either case.
    """
    global model, explainer, processor, device

    logger.info("Starting application initialization...")

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Create cache directories in /tmp which is writable
    cache_dir = "/tmp/model_cache"
    os.makedirs(cache_dir, exist_ok=True)
    logger.info(f"Created cache directory at {cache_dir}")

    # Set environment variables for model caching
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HOME"] = cache_dir

    # Load the CampaignProcessor with lazy loading
    logger.info("Initializing CampaignProcessor...")
    processor = CampaignProcessor(data=[], lazy_load=True)

    # Load model with default parameters
    model_path = "best_model.pth"
    hidden_dim = 256

    logger.info(f"Initializing KickstarterModel with hidden_dim={hidden_dim}...")
    model = KickstarterModel(hidden_dim=hidden_dim)

    if os.path.exists(model_path):
        logger.info(f"Loading model weights from {model_path}...")
        try:
            # NOTE(security): weights_only=False unpickles arbitrary objects.
            # That is only acceptable because the checkpoint ships with the
            # application; never point model_path at an untrusted file.
            checkpoint = torch.load(model_path, map_location=device, weights_only=False)
            model.load_state_dict(checkpoint['model_state_dict'])
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model weights: {str(e)}")
            logger.info("Continuing with uninitialized model weights.")
    else:
        logger.warning(f"Model file not found: {model_path}")
        logger.info("Continuing with uninitialized model weights.")

    # Done unconditionally: previously these ran only after a successful
    # weight load, leaving the fallback model in training mode and off-device.
    model.to(device)
    model.eval()

    # Initialize explainer
    logger.info("Initializing KickstarterExplainer...")
    explainer = KickstarterExplainer(model, device)

    logger.info("Application initialization completed successfully!")

    yield

    # Clean up resources on shutdown
    logger.info("Cleaning up resources...")
|
109 |
+
|
110 |
+
app = FastAPI(
|
111 |
+
title="Kickstarter Success Prediction API",
|
112 |
+
description="API for predicting the success of Kickstarter campaigns",
|
113 |
+
version="1.0.0",
|
114 |
+
lifespan=lifespan,
|
115 |
+
)
|
116 |
+
|
117 |
+
# Add CORS middleware
|
118 |
+
app.add_middleware(
|
119 |
+
CORSMiddleware,
|
120 |
+
allow_origins=["*"],
|
121 |
+
allow_credentials=True,
|
122 |
+
allow_methods=["*"],
|
123 |
+
allow_headers=["*"],
|
124 |
+
)
|
125 |
+
|
126 |
+
@app.get("/")
async def root():
    """Return a short JSON banner naming the API and pointing at /predict."""
    banner = {}
    banner["message"] = "Kickstarter Success Prediction API"
    banner["description"] = "Send a POST request to /predict with campaign data to get a prediction"
    return banner
|
132 |
+
|
133 |
+
@app.post("/predict")
async def predict(request: Request):
    """Predict campaign success and explain the prediction.

    The request body must be a JSON object of campaign fields. It is run
    through ``preprocess_raw_data``, turned into one batch-of-one tensor per
    embedding plus a numerical-feature tensor, and passed to the global
    explainer.

    Returns:
        JSONResponse with ``success_probability``, ``predicted_outcome``,
        ``shap_values`` (sorted by absolute magnitude) and, when the
        processor produced one, ``longformer_embedding`` as a plain list.

    Raises:
        HTTPException: 400 if the body is not valid JSON or not a JSON
            object; 500 for any failure during preprocessing or inference.
    """
    logger.info("Received prediction request")

    # Client mistakes are reported as 400 rather than as a server error.
    try:
        campaign_data = await request.json()
    except ValueError as e:
        logger.warning(f"Rejected request with invalid JSON body: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") from e

    if not isinstance(campaign_data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    # Size of the zero vector substituted for each missing embedding; any
    # name not listed here (i.e. category_embedding) falls back to 15.
    zero_dims = {
        'description_embedding': 768,
        'blurb_embedding': 384,
        'risk_embedding': 384,
        'subcategory_embedding': 100,
        'country_embedding': 100,
    }

    try:
        logger.info(f"Campaign data received: {json.dumps(campaign_data)[:100]}...")

        # Process the campaign data
        logger.info("Processing campaign data...")
        processed_data = preprocess_raw_data(campaign_data)
        logger.info("Campaign data processed successfully")

        # Keep the raw description embedding so it can be echoed back
        raw_longformer_embedding = processed_data.get('description_embedding')

        # Process embeddings
        logger.info("Preparing inputs for model...")
        processed_inputs = {}
        for embedding_name in EMBEDDING_NAMES:
            if embedding_name in processed_data:
                processed_inputs[embedding_name] = torch.tensor(
                    processed_data[embedding_name], dtype=torch.float32
                ).unsqueeze(0)
            else:
                dim = zero_dims.get(embedding_name, 15)
                processed_inputs[embedding_name] = torch.zeros((1, dim), dtype=torch.float32)
                logger.warning(f"Using zero tensor for missing embedding: {embedding_name}")

        # Process numerical features (missing fields default to 0)
        numerical_features = [processed_data.get(field, 0) for field in NUMERICAL_FIELDS]
        processed_inputs['numerical_features'] = torch.tensor([numerical_features], dtype=torch.float32)

        # Predict and explain
        logger.info("Running prediction and generating explanations...")
        prediction, shap_values = explainer.explain_prediction(processed_inputs)
        probability = float(prediction)
        logger.info(f"Prediction completed: {probability:.4f}")

        # Sort SHAP values by absolute magnitude
        sorted_shap = dict(sorted(shap_values.items(), key=lambda x: abs(x[1]), reverse=True))

        result = {
            "success_probability": probability,
            "predicted_outcome": "Success" if probability >= 0.5 else "Failure",
            "shap_values": {k: float(v) for k, v in sorted_shap.items()}
        }

        if raw_longformer_embedding is not None:
            # The processor may hand back a numpy array, which JSONResponse
            # cannot serialise; normalise to a list of Python floats.
            result["longformer_embedding"] = np.asarray(raw_longformer_embedding).tolist()

        logger.info("Returning prediction results")
        return JSONResponse(content=result)

    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e
|
194 |
+
|
195 |
+
def preprocess_raw_data(campaign_data):
    """Run one raw campaign through the global CampaignProcessor.

    After ``processor.process_campaign`` has produced its output, any field
    listed in NUMERICAL_FIELDS that the caller supplied is copied over the
    processed value, so caller-provided numbers always win.

    Args:
        campaign_data: the campaign fields as received from the client
            (membership-tested and indexed by field name).

    Returns:
        The processed data returned by the processor, with caller-supplied
        numerical fields applied.

    Raises:
        Exception: wraps any failure, chained to the original error.
    """
    try:
        # Process the single campaign
        logger.info("Processing campaign with CampaignProcessor...")
        processed_data = processor.process_campaign(campaign_data, idx=0)

        # Preserve existing numerical values from input if present
        for field in NUMERICAL_FIELDS:
            if field in campaign_data:
                processed_data[field] = campaign_data[field]
                logger.info(f"Using provided value for {field}: {campaign_data[field]}")

        return processed_data

    except Exception as e:
        logger.error(f"Error preprocessing raw data: {str(e)}", exc_info=True)
        # Explicit chaining keeps the root cause attached to the wrapper.
        raise Exception(f"Error preprocessing raw data: {str(e)}") from e
|
213 |
+
|
214 |
+
# Debugging endpoint to check the environment and loaded resources
|
215 |
+
@app.get("/debug")
async def debug():
    """Report the status of the API and its runtime environment as JSON.

    Runs three independent probes, each of which records its own error
    instead of failing the request: an HTTP GET to huggingface.co, a
    tokenizer load for ``allenai/longformer-base-4096`` into the /tmp cache,
    and a disk-usage reading for /tmp. The response also states whether the
    global model, explainer and processor are set, the device in use, and
    the cache-related environment variables.

    NOTE(review): the probes use blocking calls inside an async handler and
    the endpoint exposes environment details; consider restricting it
    outside of debugging sessions.
    """
    # Check internet connectivity
    internet_check = {"status": "unknown", "message": ""}
    try:
        import requests
        response = requests.get("https://huggingface.co", timeout=5)
        connected = response.status_code == 200
        internet_check = {
            "status": "connected" if connected else "error",
            "status_code": response.status_code,
            # Only claim success when the probe actually returned HTTP 200.
            "message": (
                "Successfully connected to huggingface.co"
                if connected
                else f"huggingface.co responded with HTTP {response.status_code}"
            ),
        }
    except Exception as e:
        internet_check = {"status": "error", "message": f"Error connecting to internet: {str(e)}"}

    # Try to load the tokenizer directly as a test
    tokenizer_check = {"status": "unknown", "message": ""}
    try:
        from transformers import AutoTokenizer
        cache_dir = "/tmp/model_cache"
        os.makedirs(cache_dir, exist_ok=True)
        test_model_name = "allenai/longformer-base-4096"
        # Only success/failure matters, so the tokenizer itself is discarded.
        AutoTokenizer.from_pretrained(test_model_name, cache_dir=cache_dir)
        tokenizer_check = {"status": "success", "message": f"Successfully loaded {test_model_name} tokenizer"}
    except Exception as e:
        tokenizer_check = {"status": "error", "message": f"Error loading tokenizer: {str(e)}"}

    # Check disk space
    disk_space = {"status": "unknown", "message": ""}
    try:
        import shutil
        total, used, free = shutil.disk_usage("/tmp")
        disk_space = {
            "status": "ok",
            "total_gb": total / (1024**3),
            "used_gb": used / (1024**3),
            "free_gb": free / (1024**3),
            "percent_used": (used / total) * 100
        }
    except Exception as e:
        disk_space = {"status": "error", "message": f"Error checking disk space: {str(e)}"}

    debug_info = {
        "api_status": "running",
        "device": str(device),
        "model_loaded": model is not None,
        "explainer_loaded": explainer is not None,
        "processor_loaded": processor is not None,
        "cuda_available": torch.cuda.is_available(),
        "environment_variables": {
            "TRANSFORMERS_CACHE": os.environ.get("TRANSFORMERS_CACHE", "Not set"),
            "HF_HOME": os.environ.get("HF_HOME", "Not set"),
        },
        "model_cache_exists": os.path.exists("/tmp/model_cache"),
        "model_file_exists": os.path.exists("best_model.pth"),
        "tmp_directory_writable": os.access("/tmp", os.W_OK),
        "internet_connectivity": internet_check,
        "tokenizer_test": tokenizer_check,
        "disk_space": disk_space
    }

    return JSONResponse(content=debug_info)
|
280 |
+
|
281 |
+
if __name__ == "__main__":
|
282 |
+
import uvicorn
|
283 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2d55ef67179ea54e94d958795cd79b7d29e8605f9e7c135d34d5dc56079e41f
|
3 |
+
size 6324090
|
network_test.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import json
|
4 |
+
import time
|
5 |
+
from pprint import pprint
|
6 |
+
|
7 |
+
# Test internet connectivity
|
8 |
+
def check_internet():
    """Print the HTTP status and round-trip time for a fixed set of URLs.

    Each URL is fetched with a 10-second timeout; a failure on one URL is
    printed and does not stop the remaining probes.
    """
    print("\n=== TESTING INTERNET CONNECTIVITY ===")
    try:
        targets = (
            "https://huggingface.co",
            "https://google.com",
            "https://huggingface.co/allenai/longformer-base-4096",
        )

        for target in targets:
            try:
                print(f"Testing connection to {target}...")
                began = time.time()
                reply = requests.get(target, timeout=10)
                took = time.time() - began
                print(f" Status: {reply.status_code}, Time: {took:.2f}s")
            except Exception as err:
                print(f" Error: {str(err)}")
    except Exception as err:
        print(f"Network test failed: {str(err)}")
|
28 |
+
|
29 |
+
# Test model download
|
30 |
+
def test_model_download():
    """Download a small tokenizer into a scratch cache and report the outcome.

    Prints the elapsed time, the tokenizer class name and a sample of the
    entries that appeared in the cache directory. Any failure is printed
    rather than raised.
    """
    print("\n=== TESTING MODEL DOWNLOAD ===")
    try:
        from transformers import AutoTokenizer

        # Scratch cache location for this test only
        scratch_dir = "/tmp/model_cache_test"
        os.makedirs(scratch_dir, exist_ok=True)
        print(f"Created test cache directory at {scratch_dir}")

        # A small model keeps the download quick
        checkpoint_name = "distilbert-base-uncased"
        print(f"Trying to download {checkpoint_name}...")

        began = time.time()
        tok = AutoTokenizer.from_pretrained(checkpoint_name, cache_dir=scratch_dir)
        took = time.time() - began

        print(f"Successfully downloaded tokenizer in {took:.2f}s")
        print(f"Tokenizer type: {type(tok).__name__}")

        # Confirm the download actually wrote something to disk
        if os.path.exists(scratch_dir):
            entries = os.listdir(scratch_dir)
            print(f"Files in cache directory: {len(entries)}")
            if entries:
                print(f"Sample files: {entries[:5]}")
    except Exception as err:
        print(f"Model download test failed: {str(err)}")
|
59 |
+
|
60 |
+
# Check disk space
|
61 |
+
def check_disk_space():
    """Print total, used and free space (in GB) for /tmp, / and /home.

    Directories that do not exist are skipped; an error reading one
    directory is printed and the remaining ones are still checked.
    """
    print("\n=== CHECKING DISK SPACE ===")
    try:
        import shutil

        gib = 1024**3
        for mount in ("/tmp", "/", "/home"):
            if not os.path.exists(mount):
                continue
            try:
                total, used, free = shutil.disk_usage(mount)
                print(f"Disk space for {mount}:")
                print(f" Total: {total / gib:.2f} GB")
                print(f" Used: {used / gib:.2f} GB")
                print(f" Free: {free / gib:.2f} GB")
                print(f" Percent used: {(used / total) * 100:.1f}%")
            except Exception as err:
                print(f" Error checking {mount}: {str(err)}")
    except Exception as err:
        print(f"Disk space check failed: {str(err)}")
|
82 |
+
|
83 |
+
# Test GloVe model loading
|
84 |
+
def test_glove_loading():
    """Load the 100-dimensional GloVe vectors via gensim and report the outcome.

    Sets GENSIM_DATA_DIR to a directory under /tmp, loads
    'glove-wiki-gigaword-100', then prints the load time, the model type,
    the vocabulary size and a sample of the files in the data directory.
    Any failure is printed rather than raised.
    """
    print("\n=== TESTING GLOVE MODEL LOADING ===")
    try:
        import gensim.downloader

        # Point gensim at a writable location
        data_dir = "/tmp/gensim-data"
        os.environ['GENSIM_DATA_DIR'] = data_dir
        os.makedirs(data_dir, exist_ok=True)
        print(f"Set GENSIM_DATA_DIR to {data_dir}")

        print("Trying to download GloVe model (this might take a while)...")
        began = time.time()
        vectors = gensim.downloader.load('glove-wiki-gigaword-100')
        took = time.time() - began

        print(f"Successfully loaded GloVe model in {took:.2f}s")
        print(f"GloVe model type: {type(vectors).__name__}")
        print(f"Vocabulary size: {len(vectors.key_to_index)}")

        # Report what ended up in the data directory
        if os.path.exists(data_dir):
            entries = os.listdir(data_dir)
            print(f"Files in GloVe directory: {len(entries)}")
            if entries:
                print(f"Sample files: {entries[:5]}")
    except Exception as err:
        print(f"GloVe model loading test failed: {str(err)}")
|
114 |
+
|
115 |
+
# Main function
|
116 |
+
if __name__ == "__main__":
|
117 |
+
print("Starting network and model download tests...")
|
118 |
+
|
119 |
+
# Run tests
|
120 |
+
check_internet()
|
121 |
+
test_model_download()
|
122 |
+
test_glove_loading()
|
123 |
+
check_disk_space()
|
124 |
+
|
125 |
+
print("\nAll tests completed.")
|
requirements.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=1.13.0
|
2 |
+
numpy>=1.22.0
|
3 |
+
transformers>=4.26.0
|
4 |
+
gensim>=4.3.0
|
5 |
+
fastapi>=0.95.0
|
6 |
+
uvicorn>=0.22.0
|
7 |
+
shap>=0.41.0
|
8 |
+
pydantic>=1.10.7
|
9 |
+
sentence-transformers>=2.2.2
|
10 |
+
# Specific models needed
|
11 |
+
tokenizers>=0.13.2
|
12 |
+
sentencepiece>=0.1.97
|
13 |
+
# For text processing
|
14 |
+
nltk>=3.7
|
15 |
+
# For web requests
|
16 |
+
requests>=2.28.0
|
17 |
+
# For CORS
|
18 |
+
starlette>=0.26.0
|
src/ProcessOneSingleCampaign.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# Set gensim data directory to a writable location at the very start
|
3 |
+
os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
|
4 |
+
try:
|
5 |
+
os.makedirs('/tmp/gensim-data', exist_ok=True)
|
6 |
+
print(f"Created directory at {os.environ['GENSIM_DATA_DIR']}")
|
7 |
+
except Exception as e:
|
8 |
+
print(f"Error creating gensim directory: {str(e)}")
|
9 |
+
|
10 |
+
import json
|
11 |
+
import numpy as np
|
12 |
+
from typing import Dict
|
13 |
+
import torch
|
14 |
+
from transformers import AutoTokenizer, AutoModel
|
15 |
+
import gc
|
16 |
+
import gensim.downloader
|
17 |
+
|
18 |
+
class CampaignProcessor:
|
19 |
+
def __init__(self, data, lazy_load=False):
|
20 |
+
self.data = data
|
21 |
+
self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
|
22 |
+
self.lazy_load = lazy_load
|
23 |
+
|
24 |
+
self.tokenizer = None
|
25 |
+
self.model = None
|
26 |
+
self.RiskandBlurb_tokenizer = None
|
27 |
+
self.RiskandBlurb_model = None
|
28 |
+
self.glove = None
|
29 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
30 |
+
|
31 |
+
if not lazy_load:
|
32 |
+
self._load_models()
|
33 |
+
|
34 |
+
def _load_models(self):
    """Load Longformer, MiniLM and GloVe and move the transformers to self.device.

    Progress and diagnostics are printed at each step. A failed
    connectivity probe or cache-directory creation is only reported; a
    failure to load any model, or to move the models to the device, is
    printed and then re-raised to the caller.

    Raises:
        Exception: whatever the underlying loader raised.
    """
    print("Loading NLP models...")
    # Cache models locally to avoid downloading every time
    cache_dir = "/tmp/model_cache"
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HOME"] = cache_dir

    try:
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Created cache directory at {cache_dir}")
    except Exception as e:
        print(f"Error creating cache directory: {str(e)}")

    # Longformer model and tokenizer (for processing description)
    model_name = "allenai/longformer-base-4096"
    print(f"Loading {model_name}...")

    try:
        # Best-effort connectivity probe: diagnostic output only
        try:
            import requests
            print("Testing internet connectivity...")
            response = requests.get("https://huggingface.co", timeout=5)
            if response.status_code == 200:
                print("Successfully connected to huggingface.co")
            else:
                print(f"Error connecting to huggingface.co: {response.status_code}")
        except Exception as e:
            print(f"Network connectivity test failed: {str(e)}")

        # Report whether the cache directory exists and is writable
        if os.path.exists(cache_dir):
            print(f"Cache directory {cache_dir} exists")
            if os.access(cache_dir, os.W_OK):
                print(f"Cache directory {cache_dir} is writable")
            else:
                print(f"Cache directory {cache_dir} is not writable")
        else:
            print(f"Cache directory {cache_dir} does not exist")

        # cache_dir is passed explicitly rather than relying on the
        # environment variables set above.
        print(f"Initializing tokenizer from {model_name} with cache_dir={cache_dir}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        print("Tokenizer loaded successfully")

        print(f"Initializing model from {model_name} with cache_dir={cache_dir}")
        self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
        print("Model loaded successfully")
    except Exception as e:
        print(f"Error loading Longformer model: {str(e)}")
        # Bare raise preserves the original traceback
        raise

    try:
        # MiniLM model and tokenizer (for processing risk and blurb)
        RiskandBlurb_model_name = "sentence-transformers/all-minilm-l6-v2"
        print(f"Loading {RiskandBlurb_model_name}...")
        self.RiskandBlurb_tokenizer = AutoTokenizer.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
        self.RiskandBlurb_model = AutoModel.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
        print("RiskandBlurb model loaded successfully")
    except Exception as e:
        print(f"Error loading minilm model: {str(e)}")
        raise

    try:
        # GloVe model for country and subcategory embeddings
        print("Loading GloVe model...")
        # GENSIM_DATA_DIR is already set at the top of the file
        print(f"Using GENSIM_DATA_DIR: {os.environ.get('GENSIM_DATA_DIR', 'Not set')}")

        self.glove = gensim.downloader.load('glove-wiki-gigaword-100')
        print("GloVe model loaded successfully")
    except Exception as e:
        print(f"Error loading GloVe model: {str(e)}")
        raise

    try:
        # Move models to device
        self.model = self.model.to(self.device)
        self.RiskandBlurb_model = self.RiskandBlurb_model.to(self.device)
        print("All models loaded successfully.")
    except Exception as e:
        print(f"Error moving models to device: {str(e)}")
        raise
|
121 |
+
|
122 |
+
def _ensure_models_loaded(self):
    """Trigger ``_load_models`` when any required component is still ``None``.

    The components are checked in a fixed order and the check stops at the
    first one that is missing.
    """
    required_attributes = (
        "model",
        "tokenizer",
        "RiskandBlurb_model",
        "RiskandBlurb_tokenizer",
        "glove",
    )
    # any() over a generator short-circuits, so later attributes are only
    # read when all earlier ones are already populated.
    if any(getattr(self, name) is None for name in required_attributes):
        self._load_models()
|
125 |
+
|
126 |
+
def _process_text_embedding(self, text, max_length, tokenizer, model):
    """Embed ``text`` with ``model`` and return one mean-pooled vector.

    Args:
        text: Input passed straight to ``tokenizer``.
        max_length: Truncation limit handed to the tokenizer.
        tokenizer: Callable returning a mapping of tensors that includes
            ``attention_mask``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        The first row of the pooled batch as a NumPy array.
    """
    on_gpu = self.device.type == 'cuda'

    # Free cached memory before the forward pass.
    if on_gpu:
        torch.cuda.empty_cache()
    gc.collect()

    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_output = model(**encoded)

    # Mean pooling: average token vectors, counting only positions where
    # the attention mask is set.
    hidden_states = model_output.last_hidden_state
    mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp so an all-zero mask cannot cause a division by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = weighted_sum / token_counts

    result = pooled.cpu().numpy()

    # Drop tensor references and release memory again before returning.
    del encoded, model_output, hidden_states, pooled
    if on_gpu:
        torch.cuda.empty_cache()
    gc.collect()

    return result[0]
|
156 |
+
|
157 |
+
def _get_glove_embedding(self, text, dim=100):
    """Average the GloVe vectors of the whitespace-separated words in ``text``.

    Args:
        text: Label to embed; lower-cased and stripped before splitting.
        dim: Length of the zero vector returned when nothing can be embedded.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``self.glove``, or ``np.zeros(dim)`` when ``text`` is falsy or no
        word is known.
    """
    if not text:
        return np.zeros(dim)

    tokens = text.lower().strip().split()
    known_vectors = [self.glove[token] for token in tokens if token in self.glove]

    if not known_vectors:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)
|
175 |
+
|
176 |
+
def process_description_embedding(self, campaign: Dict, idx: int):
    """Embed the campaign description and count its words.

    Args:
        campaign: Campaign dict; ``raw_description`` is read (default ``''``).
        idx: Not used by this method.

    Returns:
        ``(embedding, word_count)``. On any error the failure is printed and
        ``(np.zeros(768), 0)`` is returned instead.
    """
    self._ensure_models_loaded()

    try:
        description = campaign.get("raw_description", '')
        word_count = len(description.split())
        # The description is tokenized with a 4096-token limit.
        vector = self._process_text_embedding(
            description, 4096, self.tokenizer, self.model
        )
        return vector, word_count
    except Exception as err:
        print(f"Error processing description: {str(err)}")
        return np.zeros(768), 0
|
187 |
+
|
188 |
+
def process_riskandchallenges_embedding(self, campaign: Dict, idx: int):
    """Embed the campaign's risks-and-challenges text.

    Args:
        campaign: Campaign dict; ``raw_risks`` is read (default ``''``).
        idx: Not used by this method.

    Returns:
        The embedding from the risk/blurb model (512-token limit), or
        ``np.zeros(384)`` after printing the error if anything fails.
    """
    self._ensure_models_loaded()

    try:
        risks_text = campaign.get("raw_risks", '')
        vector = self._process_text_embedding(
            risks_text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model
        )
        return vector
    except Exception as err:
        print(f"Error processing risk statement: {str(err)}")
        return np.zeros(384)
|
197 |
+
|
198 |
+
def process_blurb(self, campaign: Dict, idx: int):
    """Embed the campaign blurb.

    Args:
        campaign: Campaign dict; ``raw_blurb`` is read (default ``''``).
        idx: Not used by this method.

    Returns:
        The embedding from the risk/blurb model (512-token limit), or
        ``np.zeros(384)`` after printing the error if anything fails.
    """
    self._ensure_models_loaded()

    try:
        blurb_text = campaign.get("raw_blurb", '')
        vector = self._process_text_embedding(
            blurb_text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model
        )
        return vector
    except Exception as err:
        print(f"Error processing blurb: {str(err)}")
        return np.zeros(384)
|
207 |
+
|
208 |
+
def process_category(self, campaign: Dict):
    """One-hot encode the campaign's top-level category.

    Args:
        campaign: Campaign dict; ``raw_category`` is read (default ``''``).

    Returns:
        A list of 15 ints, one per known category in a fixed order, with a 1
        at the matching category. An unknown category gives all zeros; any
        error is printed and also gives all zeros.
    """
    try:
        # Fixed category order; it determines the position of each one-hot slot.
        known_categories = (
            "Art", "Comics", "Crafts", "Dance", "Design", "Fashion",
            "Film & Video", "Food", "Games", "Journalism", "Music",
            "Photography", "Publishing", "Technology", "Theater",
        )
        selected = campaign.get('raw_category', '')
        return [int(name == selected) for name in known_categories]
    except Exception as err:
        print(f"Error processing category: {str(err)}")
        return [0] * 15
|
224 |
+
|
225 |
+
def process_subcategory_embedding(self, campaign: Dict, idx: int):
    """Embed the campaign subcategory with GloVe.

    Args:
        campaign: Campaign dict; ``raw_subcategory`` is read (default ``''``).
        idx: Not used by this method.

    Returns:
        The result of ``_get_glove_embedding`` for the subcategory, or
        ``np.zeros(100)`` after printing the error if anything fails.
    """
    self._ensure_models_loaded()

    try:
        label = campaign.get('raw_subcategory', '')
        vector = self._get_glove_embedding(label)
        return vector
    except Exception as err:
        print(f"Error processing subcategory: {str(err)}")
        return np.zeros(100)
|
234 |
+
|
235 |
+
def process_country_embedding(self, campaign: Dict, idx: int):
    """Embed the campaign country with GloVe.

    Args:
        campaign: Campaign dict; ``raw_country`` is read (default ``''``).
        idx: Not used by this method.

    Returns:
        The result of ``_get_glove_embedding`` for the country, or
        ``np.zeros(100)`` after printing the error if anything fails.
    """
    self._ensure_models_loaded()

    try:
        label = campaign.get('raw_country', '')
        vector = self._get_glove_embedding(label)
        return vector
    except Exception as err:
        print(f"Error processing country: {str(err)}")
        return np.zeros(100)
|
244 |
+
|
245 |
+
def process_funding_goal(self, campaign: Dict, idx: int):
    """Return ``campaign['funding_goal']`` as a float (0.0 when the key is absent).

    ``idx`` is not used by this method.
    """
    raw_goal = campaign.get('funding_goal', 0)
    return float(raw_goal)
|
247 |
+
|
248 |
+
def process_previous_funding_goal(self, campaign: Dict, idx: int):
    """Return ``campaign['previous_funding_goal']`` as a float (0.0 when absent).

    ``idx`` is not used by this method.
    """
    raw_goal = campaign.get('previous_funding_goal', 0)
    return float(raw_goal)
|
250 |
+
|
251 |
+
def process_previous_pledged(self, campaign: Dict, idx: int):
    """Return ``campaign['previous_pledged']`` as a float (0.0 when absent).

    ``idx`` is not used by this method.
    """
    raw_pledged = campaign.get('previous_pledged', 0)
    return float(raw_pledged)
|
253 |
+
|
254 |
+
def calculate_previous_sucess_rate(self, campaign: Dict, idx: int):
    """Return ``campaign['previous_success_rate']`` as a float (0.0 when absent).

    No rate is computed here; the value is read from the campaign as-is.
    ``idx`` is not used by this method.
    """
    raw_rate = campaign.get('previous_success_rate', 0)
    return float(raw_rate)
|
256 |
+
|
257 |
+
def process_campaign(self, campaign: Dict, idx: int):
    """Convert one raw campaign dict into the model-ready feature dict.

    Args:
        campaign: Raw campaign data. Text fields (``raw_description``,
            ``raw_blurb``, ``raw_risks``, ``raw_category``,
            ``raw_subcategory``, ``raw_country``) are embedded; numerical
            fields are read by name.
        idx: Forwarded unchanged to the per-field processors.

    Returns:
        A dict holding the six embeddings as plain lists,
        ``description_length``, four float fields (``funding_goal``,
        ``previous_funding_goal``, ``previous_pledged``,
        ``previous_success_rate``) and four int fields (``image_count``,
        ``video_count``, ``campaign_duration``,
        ``previous_projects_count``).

    Raises:
        TypeError, ValueError: If a numerical field is present but cannot
            be converted to ``float`` / ``int``.
    """
    self._ensure_models_loaded()

    # Generate the description embedding first; it also yields a word count.
    description_embedding, calculated_description_length = (
        self.process_description_embedding(campaign, idx)
    )

    # A caller-supplied description_length wins over the computed one.
    description_length = campaign.get('description_length', calculated_description_length)

    result = {
        'description_embedding': description_embedding.tolist(),
        'description_length': description_length,
        'blurb_embedding': self.process_blurb(campaign, idx).tolist(),
        'risk_embedding': self.process_riskandchallenges_embedding(campaign, idx).tolist(),
        'category_embedding': self.process_category(campaign),
        'subcategory_embedding': self.process_subcategory_embedding(campaign, idx).tolist(),
        'country_embedding': self.process_country_embedding(campaign, idx).tolist(),
    }

    numerical_fields = (
        ('funding_goal', self.process_funding_goal),
        ('previous_funding_goal', self.process_previous_funding_goal),
        ('previous_pledged', self.process_previous_pledged),
        ('previous_success_rate', self.calculate_previous_sucess_rate),
    )

    # Always go through the processor. Previously a value present in the
    # input was copied through uncoerced (e.g. a JSON string stayed a
    # string) and the processors only ran to supply the 0.0 default. Each
    # processor reads the field with a default of 0 and converts to float,
    # so the output type no longer depends on the caller.
    for field_name, processor_func in numerical_fields:
        result[field_name] = processor_func(campaign, idx)

    # Simple integer fields, defaulting to 0 when absent.
    for field in ('image_count', 'video_count', 'campaign_duration', 'previous_projects_count'):
        result[field] = int(campaign.get(field, 0))

    return result
|
src/__pycache__/ProcessOneSingleCampaign.cpython-311.pyc
ADDED
Binary file (14.4 kB). View file
|
|
src/__pycache__/explainer.cpython-310.pyc
ADDED
Binary file (3.11 kB). View file
|
|
src/__pycache__/explainer.cpython-311.pyc
ADDED
Binary file (5.88 kB). View file
|
|
src/__pycache__/model.cpython-310.pyc
ADDED
Binary file (4.3 kB). View file
|
|
src/__pycache__/model.cpython-311.pyc
ADDED
Binary file (6.39 kB). View file
|
|
src/explainer.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from typing import Dict, Tuple
|
4 |
+
|
5 |
+
from src.model import KickstarterModel
|
6 |
+
|
7 |
+
class KickstarterExplainer:
    """Kickstarter prediction model explainer.

    Each feature's contribution is the model output with only that feature
    present (all other inputs zeroed) minus the output for an all-zero
    input. The results are reported under the name "SHAP values", but they
    are single-feature differences against a zero baseline, not exact
    Shapley values.
    """

    def __init__(self, model: "KickstarterModel", device: torch.device = None):
        """
        Initialize the explainer.

        Args:
            model: Trained model. It is moved to ``device`` and switched to
                eval mode. Calling it with a dict of tensors must return a
                ``(probabilities, extras)`` pair.
            device: Computation device. Defaults to CUDA when available,
                otherwise CPU.
        """
        self.model = model
        self.device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

        # Names for the columns of inputs['numerical_features'], by position.
        self.numerical_feature_names = [
            'description_length',
            'funding_goal',
            'image_count',
            'video_count',
            'campaign_duration',
            'previous_projects_count',
            'previous_success_rate',
            'previous_pledged',
            'previous_funding_goal'
        ]

        # Mapping from reported feature names to the input dict keys.
        self.embedding_map = {
            'description_embedding': 'description_embedding',
            'blurb_embedding': 'blurb_embedding',
            'risk_embedding': 'risk_embedding',
            'subcategory_embedding': 'subcategory_embedding',
            'category_embedding': 'category_embedding',
            'country_embedding': 'country_embedding'
        }

    def _numerical_feature_name(self, index):
        """Return the name of numerical column ``index``, or a generic fallback.

        Columns beyond the known names get ``numerical_feature_<index>``
        instead of raising ``IndexError``.
        """
        names = self.numerical_feature_names
        if index < len(names):
            return names[index]
        return f"numerical_feature_{index}"

    def _compute_feature_contribution(self, baseline_probs, inputs, feature_name, is_numerical=False, index=None):
        """Return the model output for one isolated feature minus the baseline.

        Args:
            baseline_probs: Model output for the all-zero input.
            inputs: Full input dict (tensors already on ``self.device``).
            feature_name: Key in ``inputs`` to keep when ``is_numerical`` is
                False.
            is_numerical: When True, keep one column of
                ``inputs['numerical_features']`` instead.
            index: Column to keep when ``is_numerical`` is True.

        Returns:
            The difference as a Python float. ``.item()`` requires a
            single-element result, i.e. a batch of one.
        """
        # Start from an all-zero copy, then restore only the studied feature.
        feature_input = {k: torch.zeros_like(v) for k, v in inputs.items()}

        if is_numerical:
            feature_input['numerical_features'][:, index] = inputs['numerical_features'][:, index]
        else:
            feature_input[feature_name] = inputs[feature_name]

        with torch.no_grad():
            feature_probs, _ = self.model(feature_input)

        return (feature_probs - baseline_probs).cpu().item()

    def explain_prediction(self, inputs: Dict[str, torch.Tensor]) -> Tuple[float, Dict[str, float]]:
        """
        Explain a single prediction.

        Args:
            inputs: Input features for one sample (batch of one).

        Returns:
            Predicted probability and a dict of per-feature contribution
            values (see the class docstring for how they are computed).
        """
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            probs, _ = self.model(inputs)

        shap_values = {}
        baseline = {k: torch.zeros_like(v) for k, v in inputs.items()}

        # Reference output for the all-zero input.
        with torch.no_grad():
            baseline_probs, _ = self.model(baseline)

        # Contributions of embedding features that are present in the input.
        for feature_name, embedding_name in self.embedding_map.items():
            if embedding_name in inputs:
                shap_values[feature_name] = self._compute_feature_contribution(
                    baseline_probs, inputs, embedding_name
                )

        # Contributions of each numerical column, one at a time.
        if 'numerical_features' in inputs:
            num_features = inputs['numerical_features'].size(1)
            for i in range(num_features):
                feature_name = self._numerical_feature_name(i)
                shap_values[feature_name] = self._compute_feature_contribution(
                    baseline_probs, inputs, 'numerical_features',
                    is_numerical=True, index=i
                )

        prediction = probs.cpu().item()

        return prediction, shap_values
|
src/model.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from typing import Dict, Tuple
|
5 |
+
|
6 |
+
class KickstarterModel(nn.Module):
    """Kickstarter Project Success Prediction Model.

    Every input group passes through its own Linear/BatchNorm/ReLU/Dropout
    block. The block outputs are concatenated and fed through two more
    blocks and a single-unit sigmoid output.
    """

    def __init__(
        self,
        desc_embedding_dim=768,
        blurb_embedding_dim=384,
        risk_embedding_dim=384,
        subcategory_embedding_dim=100,
        category_embedding_dim=15,
        country_embedding_dim=100,
        numerical_features_dim=9,
        hidden_dim=512,
        dropout_rate=0.3
    ):
        """
        Initialize the model

        Args:
            desc_embedding_dim: Description embedding vector dimension
            blurb_embedding_dim: Blurb embedding vector dimension
            risk_embedding_dim: Risk embedding vector dimension
            subcategory_embedding_dim: Subcategory embedding vector dimension
            category_embedding_dim: Category embedding vector dimension
            country_embedding_dim: Country embedding vector dimension
            numerical_features_dim: Numerical features dimension
            hidden_dim: Hidden layer dimension
            dropout_rate: Dropout rate
        """
        super().__init__()

        # Helper to create one feature processing block. The layer order
        # fixes the parameter names inside each Sequential.
        def create_fc_block(input_dim, output_dim):
            return nn.Sequential(
                nn.Linear(input_dim, output_dim),
                nn.BatchNorm1d(output_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            )

        # Feature processing layers
        self.desc_fc = create_fc_block(desc_embedding_dim, hidden_dim)
        self.blurb_fc = create_fc_block(blurb_embedding_dim, hidden_dim // 2)
        self.risk_fc = create_fc_block(risk_embedding_dim, hidden_dim // 2)
        self.subcategory_fc = create_fc_block(subcategory_embedding_dim, hidden_dim // 4)
        self.category_fc = create_fc_block(category_embedding_dim, hidden_dim // 8)
        self.country_fc = create_fc_block(country_embedding_dim, hidden_dim // 8)
        self.numerical_fc = create_fc_block(numerical_features_dim, hidden_dim // 4)

        # Width of the concatenated block outputs, in the order used by forward().
        concat_dim = (hidden_dim +
                      hidden_dim // 2 +
                      hidden_dim // 2 +
                      hidden_dim // 4 +
                      hidden_dim // 8 +
                      hidden_dim // 8 +
                      hidden_dim // 4)

        # Fully connected layers
        self.fc1 = create_fc_block(concat_dim, hidden_dim)
        self.fc2 = create_fc_block(hidden_dim, hidden_dim // 2)

        # Output layer
        self.output = nn.Linear(hidden_dim // 2, 1)

        # Keys expected in the input dict.
        self.input_names = [
            'description_embedding',
            'blurb_embedding',
            'risk_embedding',
            'subcategory_embedding',
            'category_embedding',
            'country_embedding',
            'numerical_features'
        ]

    def forward(self, inputs: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        Forward propagation

        Args:
            inputs: Dictionary containing all input features, keyed by the
                names in ``self.input_names``.

        Returns:
            Prediction probability (the trailing size-1 dimension is
            squeezed away) and a dict of intermediate activations: one
            entry per input block plus ``combined``, ``fc1`` and ``fc2``.
        """
        # Process each input group with its own block.
        desc_out = self.desc_fc(inputs['description_embedding'])
        blurb_out = self.blurb_fc(inputs['blurb_embedding'])
        risk_out = self.risk_fc(inputs['risk_embedding'])
        subcategory_out = self.subcategory_fc(inputs['subcategory_embedding'])
        category_out = self.category_fc(inputs['category_embedding'])
        country_out = self.country_fc(inputs['country_embedding'])
        numerical_out = self.numerical_fc(inputs['numerical_features'])

        # Concatenate all features along the feature dimension.
        combined = torch.cat([
            desc_out,
            blurb_out,
            risk_out,
            subcategory_out,
            category_out,
            country_out,
            numerical_out
        ], dim=1)

        # Keep both activations under separate names. Previously one variable
        # was reused, so the 'fc1' entry actually held the fc2 output.
        fc1_out = self.fc1(combined)
        fc2_out = self.fc2(fc1_out)

        # Output layer
        logits = self.output(fc2_out)
        probs = torch.sigmoid(logits)

        intermediate_features = {
            'description_embedding': desc_out,
            'blurb_embedding': blurb_out,
            'risk_embedding': risk_out,
            'subcategory_embedding': subcategory_out,
            'category_embedding': category_out,
            'country_embedding': country_out,
            'numerical_features': numerical_out,
            'combined': combined,
            'fc1': fc1_out,
            'fc2': fc2_out
        }

        return probs.squeeze(1), intermediate_features

    def predict(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Prediction function

        Switches the module to eval mode (it stays in eval mode afterwards)
        and runs a forward pass without gradient tracking.

        Args:
            inputs: Dictionary containing all input features

        Returns:
            Prediction probability
        """
        self.eval()
        with torch.no_grad():
            probs, _ = self.forward(inputs)
        return probs
|
test.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import pprint
|
4 |
+
import time
|
5 |
+
import sys
|
6 |
+
import os
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
def check_internet_connectivity():
    """Return True when a GET to https://huggingface.co answers with HTTP 200.

    Any exception is printed and reported as False.
    """
    print("Testing internet connectivity...")
    try:
        # Short timeout so the diagnostic cannot hang.
        reply = requests.get("https://huggingface.co", timeout=5)
        status = reply.status_code
        print(f"Connection to huggingface.co: Status {status}")
        reachable = status == 200
        return reachable
    except Exception as err:
        print(f"Error connecting to huggingface.co: {str(err)}")
        return False
|
19 |
+
|
20 |
+
def check_model_repository():
    """Return True when the Longformer model page answers with HTTP 200.

    Any exception is printed and reported as False.
    """
    print("Testing connection to model repository...")
    try:
        # Public page of the model repository being probed.
        repository_url = "https://huggingface.co/allenai/longformer-base-4096"
        reply = requests.get(repository_url, timeout=5)
        status = reply.status_code
        print(f"Connection to model repository: Status {status}")
        reachable = status == 200
        return reachable
    except Exception as err:
        print(f"Error connecting to model repository: {str(err)}")
        return False
|
32 |
+
|
33 |
+
def check_debug_endpoint(api_url):
    """Fetch and print the diagnostics served by the API's debug endpoint.

    The debug URL is derived from ``api_url`` by replacing ``/predict`` with
    ``/debug``.

    Args:
        api_url: URL of the prediction endpoint.

    Returns:
        The decoded JSON dict on HTTP 200; ``None`` on any other status or
        when an exception occurs (both are printed).
    """
    debug_url = api_url.replace('/predict', '/debug')
    print(f"Checking debug endpoint at {debug_url}...")
    try:
        response = requests.get(debug_url, timeout=10)
        if response.status_code != 200:
            print(f"Error accessing debug endpoint: Status {response.status_code}")
            print(response.text)
            return None

        debug_info = response.json()
        print("Debug information retrieved:")

        # Flat values, printed as "- <label>: <value>".
        flat_fields = (
            ("API Status", 'api_status'),
            ("Model Loaded", 'model_loaded'),
            ("Cache Directory Exists", 'model_cache_exists'),
            ("Temp Directory Writable", 'tmp_directory_writable'),
        )
        for label, key in flat_fields:
            print(f"- {label}: {debug_info.get(key, 'Unknown')}")

        # Nested checks, each with a status and an optional message.
        nested_checks = (
            ("Server Internet Connectivity", 'internet_connectivity'),
            ("Tokenizer Test", 'tokenizer_test'),
        )
        for label, key in nested_checks:
            section = debug_info.get(key, {})
            print(f"- {label}: {section.get('status', 'Unknown')}")
            if section.get('message'):
                print(f" Message: {section.get('message')}")

        # Disk usage is printed only when the server reports status 'ok'.
        disk_space = debug_info.get('disk_space', {})
        if disk_space.get('status') == 'ok':
            total_gb = disk_space.get('total_gb', 0)
            used_gb = disk_space.get('used_gb', 0)
            free_gb = disk_space.get('free_gb', 0)
            percent_used = disk_space.get('percent_used', 0)
            print(f"- Disk Space: Total: {total_gb:.2f} GB, Used: {used_gb:.2f} GB, Free: {free_gb:.2f} GB ({percent_used:.1f}% used)")

        return debug_info
    except Exception as err:
        print(f"Exception when accessing debug endpoint: {str(err)}")
        return None
|
71 |
+
|
72 |
+
# API endpoint on Hugging Face Spaces
|
73 |
+
API_URL = "https://angusfung-kickstarter-success-prediction.hf.space/predict"
|
74 |
+
|
75 |
+
# Sample input data (similar to what would be in input.json)
|
76 |
+
campaign_data = {
|
77 |
+
"raw_description": "Introducing the AquaGo: The Smart, Eco-Friendly Portable Water Purifier! Clean water is a basic human right β yet for millions around the world, it's a daily struggle. Whether you're an outdoor adventurer, traveling to remote areas, or preparing for emergencies, access to safe drinking water should never be a compromise. That's why we created **AquaGo**, a revolutionary portable water purifier that combines cutting-edge filtration technology, smart sensors, and sustainable materials β all packed into a sleek, lightweight design you can take anywhere.",
|
78 |
+
"raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier that delivers clean, safe drinking water anywhere.",
|
79 |
+
"raw_risks": "Bringing a product to market involves complex engineering, regulatory approvals, and safety testing. Delays may occur due to certification or supply chain issues.",
|
80 |
+
"raw_subcategory": "Gadgets",
|
81 |
+
"raw_category": "Technology",
|
82 |
+
"raw_country": "Canada",
|
83 |
+
"funding_goal": 2000,
|
84 |
+
"image_count": 8,
|
85 |
+
"video_count": 3,
|
86 |
+
"campaign_duration": 90,
|
87 |
+
"previous_projects_count": 5,
|
88 |
+
"previous_success_rate": 0.4,
|
89 |
+
"previous_pledged": 18745.33,
|
90 |
+
"previous_funding_goal": 23564.99
|
91 |
+
}
|
92 |
+
|
93 |
+
def predict_success(data, max_retries=3, retry_delay=10):
    """Send data to the API and get prediction results with retries.

    Args:
        data: JSON-serialisable campaign payload posted to ``API_URL``.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The decoded JSON response on HTTP 200. ``None`` when a
        non-retryable HTTP error is returned or every attempt has failed.
    """
    for attempt in range(max_retries):
        # No retry follows the final attempt, so waiting after it would only
        # delay the caller.
        is_last_attempt = attempt == max_retries - 1
        try:
            print(f"Sending request to: {API_URL} (Attempt {attempt + 1}/{max_retries})")
            response = requests.post(API_URL, json=data, timeout=60)

            if response.status_code == 200:
                return response.json()

            print(f"Error: {response.status_code}")
            print(response.text)

            # Only a 500 that mentions the tokenizer is retried.
            retryable = response.status_code == 500 and "Can't load tokenizer" in response.text
            if not retryable:
                # For other errors, don't retry
                return None

            if is_last_attempt:
                print("The model might be downloading. No retries left.")
            else:
                print(f"The model might be downloading. Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)

        except Exception as e:
            print(f"Exception occurred: {str(e)}")
            if not is_last_attempt:
                print(f"Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)

    return None
|
121 |
+
|
122 |
+
def display_results(results):
    """Display the prediction results in a user-friendly way.

    Args:
        results: Decoded API response containing ``success_probability``,
            ``predicted_outcome`` and ``shap_values``, and optionally
            ``longformer_embedding``. A falsy value prints a notice and
            returns.
    """
    if not results:
        print("No results to display.")
        return

    print("\n===== KICKSTARTER SUCCESS PREDICTION =====\n")
    print(f"Success Probability: {results['success_probability']:.2%}")
    print(f"Predicted Outcome: {results['predicted_outcome']}")

    print("\n----- TOP INFLUENCING FACTORS -----")
    # Convert each value to float once. Previously the sort and the sign test
    # used float(value) but the format used the raw value, so a value given
    # as a string raised ValueError in the ':.4f' format.
    contributions = [(name, float(value)) for name, value in results['shap_values'].items()]
    # Top 5 factors by absolute magnitude.
    top_factors = sorted(contributions, key=lambda item: abs(item[1]), reverse=True)[:5]

    for factor, value in top_factors:
        impact = "POSITIVE" if value > 0 else "NEGATIVE"
        print(f"{factor}: {value:.4f} ({impact})")

    print("\n----- ALL SHAP VALUES -----")
    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(results['shap_values'])

    # Display Longformer embedding information if available
    if 'longformer_embedding' in results:
        embedding = np.array(results['longformer_embedding'])
        print("\n----- LONGFORMER EMBEDDING -----")
        print(f"Embedding Shape: {embedding.shape}")
        print(f"First 10 values: {embedding[:10]}")

        # Basic statistics. Failures (e.g. a non-numeric embedding) are
        # reported rather than raised.
        try:
            print(f"Mean: {np.mean(embedding):.4f}")
            print(f"Std: {np.std(embedding):.4f}")
            print(f"Min: {np.min(embedding):.4f}")
            print(f"Max: {np.max(embedding):.4f}")
        except Exception as e:
            print(f"Error calculating embedding statistics: {str(e)}")
|
164 |
+
|
165 |
+
# Main execution
|
166 |
+
if __name__ == "__main__":
    # Script entry point: run the client-side diagnostics, query the
    # server's debug endpoint, then send one sample prediction request.
    print("==== DIAGNOSTICS ====")
    print("Testing connectivity from client machine...")
    internet_ok = check_internet_connectivity()
    repo_ok = check_model_repository()

    debug_info = check_debug_endpoint(API_URL)

    print("\n==== PREDICTION TEST ====")
    # Warn about failed client-side checks, but still attempt the request.
    client_checks = (
        (internet_ok, "WARNING: Internet connectivity issues detected on client machine."),
        (repo_ok, "WARNING: Cannot access model repository from client machine."),
    )
    for passed, warning in client_checks:
        if not passed:
            print(warning)

    print("Sending prediction request...")
    results = predict_success(campaign_data, max_retries=2, retry_delay=10)
    display_results(results)
|