angusfung committed
Commit 7812756 · 1 Parent(s): 4df9cee

Initial setup with Longformer embedding feature

.huggingface-space ADDED
@@ -0,0 +1,8 @@
+ title: Kickstarter Success Prediction
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ sdk_version: "3.9"
+ app_file: app.py
+ pinned: false
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ COPY . /app/
+
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+         build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,123 @@
  ---
- title: Kickstarter Prediction Embedding
- emoji: 🔥
- colorFrom: gray
- colorTo: blue
+ title: Kickstarter Success Prediction
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ sdk_version: "3.9"
+ app_file: app.py
  pinned: false
- license: mit
- short_description: Prediction returned with Description Embedding
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Kickstarter Success Prediction API
+
+ This API predicts the success probability of Kickstarter campaigns using machine learning.
+
+ ## API Usage
+
+ ### Endpoint: `/predict`
+
+ Send a POST request with your campaign data in JSON format.
+
+ ### Input Format
+
+ ```json
+ {
+     "raw_description": "Detailed project description...",
+     "raw_blurb": "Short project summary...",
+     "raw_risks": "Project risks and challenges...",
+     "raw_category": "Technology",
+     "raw_subcategory": "Gadgets",
+     "raw_country": "Canada",
+     "description_length": 557,
+     "funding_goal": 58000,
+     "image_count": 8,
+     "video_count": 3,
+     "campaign_duration": 90,
+     "previous_projects_count": 5,
+     "previous_success_rate": 0.4,
+     "previous_pledged": 18745.33,
+     "previous_funding_goal": 23564.99
+ }
+ ```
+
+ Numerical values provided in the input are used directly; any numerical fields that are missing are computed (or defaulted) during preprocessing.
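+
+ For example, a minimal payload (a sketch with placeholder values) can omit every numerical field:
+
+ ```python
+ # Only the raw text and categorical fields are supplied here;
+ # description_length is computed from the description text, and the
+ # remaining numerical fields fall back to 0 during preprocessing.
+ minimal_campaign = {
+     "raw_description": "Detailed project description...",
+     "raw_blurb": "Short project summary...",
+     "raw_risks": "Project risks and challenges...",
+     "raw_category": "Technology",
+     "raw_subcategory": "Gadgets",
+     "raw_country": "Canada",
+ }
+ ```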
+
+ ### Output Format
+
+ ```json
+ {
+     "success_probability": 0.7532,
+     "predicted_outcome": "Success",
+     "shap_values": {
+         "funding_goal": -0.8991450071334839,
+         "description_embedding": -0.04273056983947754,
+         "subcategory_embedding": 0.011444330215454102,
+         "previous_funding_goal": -0.008600413799285889,
+         "video_count": 0.0037734508514404297,
+         ...
+     },
+     "longformer_embedding": [0.0213, -0.0124, 0.0342, ..., 0.0547]
+ }
+ ```
+
+ - `success_probability`: A value between 0 and 1 representing the likelihood of project success
+ - `predicted_outcome`: "Success" if probability ≥ 0.5, otherwise "Failure"
+ - `shap_values`: Contribution of each feature to the prediction (positive values increase success probability, negative values decrease it)
+ - `longformer_embedding`: The 768-dimensional vector generated by the Longformer model representing the semantic content of the project description (useful for further analysis or clustering; see the sketch below)
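+
+ As a minimal sketch of the analysis/clustering use case (assuming `r1` and `r2` are parsed `/predict` responses for two different campaigns):
+
+ ```python
+ import numpy as np
+
+ def cosine_similarity(a, b):
+     a, b = np.asarray(a), np.asarray(b)
+     return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+ # Higher values mean the two project descriptions are semantically closer.
+ sim = cosine_similarity(r1["longformer_embedding"], r2["longformer_embedding"])
+ ```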
+
+ ## Example Usage with Python
+
+ ```python
+ import requests
+ import json
+ import numpy as np
+
+ # API endpoint (Spaces are served from the *.hf.space domain)
+ api_url = "https://angusfung-kickstarter-success-prediction.hf.space/predict"
+
+ # Load your campaign data
+ campaign_data = {
+     "raw_description": "Introducing the AquaGo...",
+     "raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier...",
+     "raw_risks": "Bringing a product to market involves...",
+     "raw_subcategory": "Gadgets",
+     "raw_category": "Technology",
+     "raw_country": "Canada",
+     "funding_goal": 2000,
+     "image_count": 8,
+     "video_count": 3
+ }
+
+ # Make the prediction request
+ response = requests.post(api_url, json=campaign_data)
+
+ # Print results
+ if response.status_code == 200:
+     result = response.json()
+     print(f"Success Probability: {result['success_probability']:.2f}")
+     print(f"Predicted Outcome: {result['predicted_outcome']}")
+     print("\nTop 5 SHAP Values (Feature Importance):")
+     for feature, value in list(result['shap_values'].items())[:5]:
+         print(f"{feature}: {value:.4f}")
+
+     # Access the Longformer embedding if needed
+     if 'longformer_embedding' in result:
+         embedding = np.array(result['longformer_embedding'])
+         print(f"\nLongformer Embedding Shape: {embedding.shape}")
+ else:
+     print(f"Error: {response.status_code}")
+     print(response.text)
+ ```
+
+ ## Example Usage with cURL
+
+ ```bash
+ curl -X POST "https://angusfung-kickstarter-success-prediction.hf.space/predict" \
+      -H "Content-Type: application/json" \
+      -d @campaign.json
+ ```
+
+ Where `campaign.json` contains your campaign data in the format described above.
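+
+ ### Endpoint: `/debug`
+
+ The Space also exposes `GET /debug` (see `app.py`), which reports diagnostics such as the compute device, whether the model file and cache directory exist, internet connectivity, and free disk space. A minimal sketch:
+
+ ```python
+ import requests
+
+ debug_url = "https://angusfung-kickstarter-success-prediction.hf.space/debug"
+ info = requests.get(debug_url, timeout=10).json()
+ print(info["api_status"], info["model_loaded"], info["device"])
+ ```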
app.py ADDED
@@ -0,0 +1,283 @@
+ import os
+ import json
+ import torch
+ import numpy as np
+ import logging
+ from pathlib import Path
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, HTTPException, Request, Response
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+
+ from src.model import KickstarterModel
+ from src.explainer import KickstarterExplainer
+ from src.ProcessOneSingleCampaign import CampaignProcessor
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[logging.StreamHandler()]
+ )
+ logger = logging.getLogger(__name__)
+
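+ # Note: recent PyTorch releases default torch.load to weights_only=True,
+ # which refuses to unpickle NumPy scalars stored inside older checkpoints;
+ # allow-listing the scalar type here (plus weights_only=False at load time)
+ # keeps checkpoint loading compatible.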
+ # Allow numpy.core.multiarray.scalar to be loaded safely
+ try:
+     import numpy.core.multiarray
+     torch.serialization.add_safe_globals([numpy.core.multiarray.scalar])
+     logger.info("Added numpy.core.multiarray.scalar to safe globals")
+ except Exception as e:
+     logger.warning(f"Failed to add safe globals: {str(e)}")
+
+ # Constants
+ NUMERICAL_FIELDS = [
+     'description_length', 'funding_goal', 'image_count', 'video_count',
+     'campaign_duration', 'previous_projects_count', 'previous_success_rate',
+     'previous_pledged', 'previous_funding_goal'
+ ]
+
+ EMBEDDING_NAMES = [
+     'description_embedding', 'blurb_embedding', 'risk_embedding',
+     'subcategory_embedding', 'category_embedding', 'country_embedding'
+ ]
+
+ # Global variables to store the model and processor
+ model = None
+ explainer = None
+ processor = None
+ device = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Load resources on startup
+     global model, explainer, processor, device
+
+     logger.info("Starting application initialization...")
+
+     # Set device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     logger.info(f"Using device: {device}")
+
+     # Create cache directories in /tmp, which is writable
+     cache_dir = "/tmp/model_cache"
+     os.makedirs(cache_dir, exist_ok=True)
+     logger.info(f"Created cache directory at {cache_dir}")
+
+     # Set environment variables for model caching
+     os.environ["TRANSFORMERS_CACHE"] = cache_dir
+     os.environ["HF_HOME"] = cache_dir
+
+     # Load the CampaignProcessor with lazy loading
+     logger.info("Initializing CampaignProcessor...")
+     processor = CampaignProcessor(data=[], lazy_load=True)
+
+     # Load the model with default parameters
+     model_path = "best_model.pth"
+     hidden_dim = 256
+
+     logger.info(f"Initializing KickstarterModel with hidden_dim={hidden_dim}...")
+     model = KickstarterModel(hidden_dim=hidden_dim)
+
+     if os.path.exists(model_path):
+         logger.info(f"Loading model weights from {model_path}...")
+         try:
+             # Using both approaches for maximum compatibility:
+             # 1. Safe globals added above
+             # 2. weights_only=False set explicitly
+             checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+             model.load_state_dict(checkpoint['model_state_dict'])
+             model.to(device)
+             model.eval()  # Set the model to evaluation mode
+             logger.info("Model loaded successfully!")
+         except Exception as e:
+             logger.error(f"Error loading model weights: {str(e)}")
+             logger.info("Continuing with uninitialized model weights.")
+     else:
+         logger.warning(f"Model file not found: {model_path}")
+         logger.info("Continuing with uninitialized model weights.")
+
+     # Initialize the explainer
+     logger.info("Initializing KickstarterExplainer...")
+     explainer = KickstarterExplainer(model, device)
+
+     logger.info("Application initialization completed successfully!")
+
+     yield
+
+     # Clean up resources on shutdown
+     logger.info("Cleaning up resources...")
+
+ app = FastAPI(
+     title="Kickstarter Success Prediction API",
+     description="API for predicting the success of Kickstarter campaigns",
+     version="1.0.0",
+     lifespan=lifespan,
+ )
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.get("/")
+ async def root():
+     return {
+         "message": "Kickstarter Success Prediction API",
+         "description": "Send a POST request to /predict with campaign data to get a prediction"
+     }
+
+ @app.post("/predict")
+ async def predict(request: Request):
+     try:
+         # Parse the incoming JSON data
+         logger.info("Received prediction request")
+         campaign_data = await request.json()
+         logger.info(f"Campaign data received: {json.dumps(campaign_data)[:100]}...")
+
+         # Process the campaign data
+         logger.info("Processing campaign data...")
+         processed_data = preprocess_raw_data(campaign_data)
+         logger.info("Campaign data processed successfully")
+
+         # Store the raw Longformer embedding for returning in the response
+         raw_longformer_embedding = None
+         if 'description_embedding' in processed_data:
+             raw_longformer_embedding = processed_data['description_embedding']
+
+         # Process embeddings
+         logger.info("Preparing inputs for model...")
+         processed_inputs = {}
+         for embedding_name in EMBEDDING_NAMES:
+             if embedding_name in processed_data:
+                 processed_inputs[embedding_name] = torch.tensor(processed_data[embedding_name], dtype=torch.float32).unsqueeze(0)
+             else:
+                 # Fall back to a zero vector sized to the model's expected input:
+                 # 768 (Longformer description), 384 (MiniLM blurb/risk),
+                 # 100 (GloVe subcategory/country), 15 (one-hot category)
+                 dim = 768 if embedding_name == 'description_embedding' else \
+                       384 if embedding_name in ['blurb_embedding', 'risk_embedding'] else \
+                       100 if embedding_name in ['subcategory_embedding', 'country_embedding'] else 15
+                 processed_inputs[embedding_name] = torch.zeros((1, dim), dtype=torch.float32)
+                 logger.warning(f"Using zero tensor for missing embedding: {embedding_name}")
+
+         # Process numerical features
+         numerical_features = [processed_data.get(field, 0) for field in NUMERICAL_FIELDS]
+         processed_inputs['numerical_features'] = torch.tensor([numerical_features], dtype=torch.float32)
+
+         # Predict and explain
+         logger.info("Running prediction and generating explanations...")
+         prediction, shap_values = explainer.explain_prediction(processed_inputs)
+         logger.info(f"Prediction completed: {float(prediction):.4f}")
+
+         # Sort SHAP values by absolute magnitude
+         sorted_shap = dict(sorted(shap_values.items(), key=lambda x: abs(x[1]), reverse=True))
+
+         # Return the results
+         result = {
+             "success_probability": float(prediction),
+             "predicted_outcome": "Success" if prediction >= 0.5 else "Failure",
+             "shap_values": {k: float(v) for k, v in sorted_shap.items()}
+         }
+
+         # Add the raw Longformer embedding to the result if available
+         if raw_longformer_embedding is not None:
+             result["longformer_embedding"] = raw_longformer_embedding
+
+         logger.info("Returning prediction results")
+         return JSONResponse(content=result)
+
+     except Exception as e:
+         logger.error(f"Error during prediction: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
+
+ def preprocess_raw_data(campaign_data):
+     """Preprocess raw data using CampaignProcessor"""
+     try:
+         # Process the single campaign
+         logger.info("Processing campaign with CampaignProcessor...")
+         processed_data = processor.process_campaign(campaign_data, idx=0)
+
+         # Preserve existing numerical values from the input if present
+         for field in NUMERICAL_FIELDS:
+             if field in campaign_data:
+                 processed_data[field] = campaign_data[field]
+                 logger.info(f"Using provided value for {field}: {campaign_data[field]}")
+
+         return processed_data
+
+     except Exception as e:
+         logger.error(f"Error preprocessing raw data: {str(e)}", exc_info=True)
+         raise Exception(f"Error preprocessing raw data: {str(e)}")
+
+ # Debugging endpoint to check the environment and loaded resources
+ @app.get("/debug")
+ async def debug():
+     """Endpoint for checking the status of the API and its components"""
+     global model, explainer, processor, device
+
+     # Check internet connectivity
+     internet_check = {"status": "unknown", "message": ""}
+     try:
+         import requests
+         response = requests.get("https://huggingface.co", timeout=5)
+         internet_check = {
+             "status": "connected" if response.status_code == 200 else "error",
+             "status_code": response.status_code,
+             "message": "Successfully connected to huggingface.co"
+         }
+     except Exception as e:
+         internet_check = {"status": "error", "message": f"Error connecting to internet: {str(e)}"}
+
+     # Try to load the tokenizer directly as a test
+     tokenizer_check = {"status": "unknown", "message": ""}
+     try:
+         from transformers import AutoTokenizer
+         cache_dir = "/tmp/model_cache"
+         os.makedirs(cache_dir, exist_ok=True)
+         test_model_name = "allenai/longformer-base-4096"
+         tokenizer = AutoTokenizer.from_pretrained(test_model_name, cache_dir=cache_dir)
+         tokenizer_check = {"status": "success", "message": f"Successfully loaded {test_model_name} tokenizer"}
+     except Exception as e:
+         tokenizer_check = {"status": "error", "message": f"Error loading tokenizer: {str(e)}"}
+
+     # Check disk space
+     disk_space = {"status": "unknown", "message": ""}
+     try:
+         import shutil
+         total, used, free = shutil.disk_usage("/tmp")
+         disk_space = {
+             "status": "ok",
+             "total_gb": total / (1024**3),
+             "used_gb": used / (1024**3),
+             "free_gb": free / (1024**3),
+             "percent_used": (used / total) * 100
+         }
+     except Exception as e:
+         disk_space = {"status": "error", "message": f"Error checking disk space: {str(e)}"}
+
+     debug_info = {
+         "api_status": "running",
+         "device": str(device),
+         "model_loaded": model is not None,
+         "explainer_loaded": explainer is not None,
+         "processor_loaded": processor is not None,
+         "cuda_available": torch.cuda.is_available(),
+         "environment_variables": {
+             "TRANSFORMERS_CACHE": os.environ.get("TRANSFORMERS_CACHE", "Not set"),
+             "HF_HOME": os.environ.get("HF_HOME", "Not set"),
+         },
+         "model_cache_exists": os.path.exists("/tmp/model_cache"),
+         "model_file_exists": os.path.exists("best_model.pth"),
+         "tmp_directory_writable": os.access("/tmp", os.W_OK),
+         "internet_connectivity": internet_check,
+         "tokenizer_test": tokenizer_check,
+         "disk_space": disk_space
+     }
+
+     return JSONResponse(content=debug_info)
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
best_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2d55ef67179ea54e94d958795cd79b7d29e8605f9e7c135d34d5dc56079e41f
+ size 6324090
network_test.py ADDED
@@ -0,0 +1,125 @@
+ import os
+ import requests
+ import json
+ import time
+ from pprint import pprint
+
+ # Test internet connectivity
+ def check_internet():
+     print("\n=== TESTING INTERNET CONNECTIVITY ===")
+     try:
+         urls = [
+             "https://huggingface.co",
+             "https://google.com",
+             "https://huggingface.co/allenai/longformer-base-4096"
+         ]
+
+         for url in urls:
+             try:
+                 print(f"Testing connection to {url}...")
+                 start_time = time.time()
+                 response = requests.get(url, timeout=10)
+                 elapsed = time.time() - start_time
+                 print(f"  Status: {response.status_code}, Time: {elapsed:.2f}s")
+             except Exception as e:
+                 print(f"  Error: {str(e)}")
+     except Exception as e:
+         print(f"Network test failed: {str(e)}")
+
+ # Test model download
+ def test_model_download():
+     print("\n=== TESTING MODEL DOWNLOAD ===")
+     try:
+         from transformers import AutoTokenizer
+
+         # Create the cache directory
+         cache_dir = "/tmp/model_cache_test"
+         os.makedirs(cache_dir, exist_ok=True)
+         print(f"Created test cache directory at {cache_dir}")
+
+         # Try to download a model
+         model_name = "distilbert-base-uncased"  # Smaller model for testing
+         print(f"Trying to download {model_name}...")
+
+         start_time = time.time()
+         tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+         elapsed = time.time() - start_time
+
+         print(f"Successfully downloaded tokenizer in {elapsed:.2f}s")
+         print(f"Tokenizer type: {type(tokenizer).__name__}")
+
+         # Check if files were created
+         if os.path.exists(cache_dir):
+             files = os.listdir(cache_dir)
+             print(f"Files in cache directory: {len(files)}")
+             if len(files) > 0:
+                 print(f"Sample files: {files[:5]}")
+     except Exception as e:
+         print(f"Model download test failed: {str(e)}")
+
+ # Check disk space
+ def check_disk_space():
+     print("\n=== CHECKING DISK SPACE ===")
+     try:
+         import shutil
+
+         # Check disk space in various directories
+         directories = ["/tmp", "/", "/home"]
+
+         for directory in directories:
+             if os.path.exists(directory):
+                 try:
+                     total, used, free = shutil.disk_usage(directory)
+                     print(f"Disk space for {directory}:")
+                     print(f"  Total: {total / (1024**3):.2f} GB")
+                     print(f"  Used: {used / (1024**3):.2f} GB")
+                     print(f"  Free: {free / (1024**3):.2f} GB")
+                     print(f"  Percent used: {(used / total) * 100:.1f}%")
+                 except Exception as e:
+                     print(f"  Error checking {directory}: {str(e)}")
+     except Exception as e:
+         print(f"Disk space check failed: {str(e)}")
+
+ # Test GloVe model loading
+ def test_glove_loading():
+     print("\n=== TESTING GLOVE MODEL LOADING ===")
+     try:
+         import gensim.downloader
+
+         # Set the gensim data directory to a writable location
+         gensim_dir = "/tmp/gensim-data"
+         os.environ['GENSIM_DATA_DIR'] = gensim_dir
+         os.makedirs(gensim_dir, exist_ok=True)
+         print(f"Set GENSIM_DATA_DIR to {gensim_dir}")
+
+         # Try to download GloVe
+         print("Trying to download GloVe model (this might take a while)...")
+         start_time = time.time()
+         glove = gensim.downloader.load('glove-wiki-gigaword-100')
+         elapsed = time.time() - start_time
+
+         print(f"Successfully loaded GloVe model in {elapsed:.2f}s")
+         print(f"GloVe model type: {type(glove).__name__}")
+         print(f"Vocabulary size: {len(glove.key_to_index)}")
+
+         # Check if files were created
+         if os.path.exists(gensim_dir):
+             files = os.listdir(gensim_dir)
+             print(f"Files in GloVe directory: {len(files)}")
+             if len(files) > 0:
+                 print(f"Sample files: {files[:5]}")
+     except Exception as e:
+         print(f"GloVe model loading test failed: {str(e)}")
+
+ # Main function
+ if __name__ == "__main__":
+     print("Starting network and model download tests...")
+
+     # Run tests
+     check_internet()
+     test_model_download()
+     test_glove_loading()
+     check_disk_space()
+
+     print("\nAll tests completed.")
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ torch>=1.13.0
+ numpy>=1.22.0
+ transformers>=4.26.0
+ gensim>=4.3.0
+ fastapi>=0.95.0
+ uvicorn>=0.22.0
+ shap>=0.41.0
+ pydantic>=1.10.7
+ sentence-transformers>=2.2.2
+ # Specific models needed
+ tokenizers>=0.13.2
+ sentencepiece>=0.1.97
+ # For text processing
+ nltk>=3.7
+ # For web requests
+ requests>=2.28.0
+ # For CORS
+ starlette>=0.26.0
src/ProcessOneSingleCampaign.py ADDED
@@ -0,0 +1,296 @@
+ import os
+ # Set the gensim data directory to a writable location at the very start
+ os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
+ try:
+     os.makedirs('/tmp/gensim-data', exist_ok=True)
+     print(f"Created directory at {os.environ['GENSIM_DATA_DIR']}")
+ except Exception as e:
+     print(f"Error creating gensim directory: {str(e)}")
+
+ import json
+ import numpy as np
+ from typing import Dict
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ import gc
+ import gensim.downloader
+
+ class CampaignProcessor:
+     def __init__(self, data, lazy_load=False):
+         self.data = data
+         self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
+         self.lazy_load = lazy_load
+
+         self.tokenizer = None
+         self.model = None
+         self.RiskandBlurb_tokenizer = None
+         self.RiskandBlurb_model = None
+         self.glove = None
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         if not lazy_load:
+             self._load_models()
+
+     def _load_models(self):
+         print("Loading NLP models...")
+         # Cache models locally to avoid downloading every time
+         cache_dir = "/tmp/model_cache"
+         os.environ["TRANSFORMERS_CACHE"] = cache_dir
+         os.environ["HF_HOME"] = cache_dir
+
+         try:
+             os.makedirs(cache_dir, exist_ok=True)
+             print(f"Created cache directory at {cache_dir}")
+         except Exception as e:
+             print(f"Error creating cache directory: {str(e)}")
+
+         # Initialize the Longformer model and tokenizer (for processing the description)
+         model_name = "allenai/longformer-base-4096"
+         print(f"Loading {model_name}...")
+
+         try:
+             # Internet connectivity check
+             try:
+                 import requests
+                 print("Testing internet connectivity...")
+                 response = requests.get("https://huggingface.co", timeout=5)
+                 if response.status_code == 200:
+                     print("Successfully connected to huggingface.co")
+                 else:
+                     print(f"Error connecting to huggingface.co: {response.status_code}")
+             except Exception as e:
+                 print(f"Network connectivity test failed: {str(e)}")
+
+             # Check whether the cache directory exists and is writable
+             if os.path.exists(cache_dir):
+                 print(f"Cache directory {cache_dir} exists")
+                 if os.access(cache_dir, os.W_OK):
+                     print(f"Cache directory {cache_dir} is writable")
+                 else:
+                     print(f"Cache directory {cache_dir} is not writable")
+             else:
+                 print(f"Cache directory {cache_dir} does not exist")
+
+             # Load the tokenizer and model with an explicit cache_dir parameter
+             print(f"Initializing tokenizer from {model_name} with cache_dir={cache_dir}")
+             self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+             print("Tokenizer loaded successfully")
+
+             print(f"Initializing model from {model_name} with cache_dir={cache_dir}")
+             self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
+             print("Model loaded successfully")
+         except Exception as e:
+             print(f"Error loading Longformer model: {str(e)}")
+             raise
+
+         try:
+             # Initialize the MiniLM model and tokenizer (for processing risk and blurb)
+             RiskandBlurb_model_name = "sentence-transformers/all-minilm-l6-v2"
+             print(f"Loading {RiskandBlurb_model_name}...")
+             self.RiskandBlurb_tokenizer = AutoTokenizer.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
+             self.RiskandBlurb_model = AutoModel.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
+             print("RiskandBlurb model loaded successfully")
+         except Exception as e:
+             print(f"Error loading MiniLM model: {str(e)}")
+             raise
+
+         try:
+             # Load the GloVe model for country and subcategory embeddings
+             print("Loading GloVe model...")
+             # GENSIM_DATA_DIR is already set at the top of the file
+             print(f"Using GENSIM_DATA_DIR: {os.environ.get('GENSIM_DATA_DIR', 'Not set')}")
+
+             self.glove = gensim.downloader.load('glove-wiki-gigaword-100')
+             print("GloVe model loaded successfully")
+         except Exception as e:
+             print(f"Error loading GloVe model: {str(e)}")
+             raise
+
+         try:
+             # Move models to the target device
+             self.model = self.model.to(self.device)
+             self.RiskandBlurb_model = self.RiskandBlurb_model.to(self.device)
+             print("All models loaded successfully.")
+         except Exception as e:
+             print(f"Error moving models to device: {str(e)}")
+             raise
+
+     def _ensure_models_loaded(self):
+         if self.model is None or self.tokenizer is None or self.RiskandBlurb_model is None or self.RiskandBlurb_tokenizer is None or self.glove is None:
+             self._load_models()
+
+     def _process_text_embedding(self, text, max_length, tokenizer, model):
+         # Common function for text embedding generation
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         inputs = tokenizer(text,
+                            padding=True,
+                            truncation=True,
+                            max_length=max_length,
+                            return_tensors="pt")
+
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = model(**inputs)
+
+         # Masked mean pooling: average the token embeddings, ignoring padding positions
+         attention_mask = inputs['attention_mask']
+         token_embeddings = outputs.last_hidden_state
+         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+         sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+         embedding = sentence_embeddings.cpu().numpy()
+
+         del inputs, outputs, token_embeddings, sentence_embeddings
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         return embedding[0]
+
+     def _get_glove_embedding(self, text, dim=100):
+         # Common function for GloVe embeddings (subcategory and country)
+         if not text:
+             return np.zeros(dim)
+
+         # Normalize and split the text
+         text = text.lower().strip()
+         words = text.split()
+         vectors = []
+
+         for word in words:
+             if word in self.glove:
+                 vectors.append(self.glove[word])
+
+         if vectors:
+             return np.mean(vectors, axis=0)
+         else:
+             return np.zeros(dim)
+
+     def process_description_embedding(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         try:
+             text = campaign.get("raw_description", '')
+             description_length = len(text.split())
+             embedding = self._process_text_embedding(text, 4096, self.tokenizer, self.model)
+             return embedding, description_length
+         except Exception as e:
+             print(f"Error processing description: {str(e)}")
+             return np.zeros(768), 0
+
+     def process_riskandchallenges_embedding(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         try:
+             text = campaign.get("raw_risks", '')
+             return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
+         except Exception as e:
+             print(f"Error processing risk statement: {str(e)}")
+             return np.zeros(384)
+
+     def process_blurb(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         try:
+             text = campaign.get("raw_blurb", '')
+             return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
+         except Exception as e:
+             print(f"Error processing blurb: {str(e)}")
+             return np.zeros(384)
+
+     def process_category(self, campaign: Dict):
+         try:
+             # All categories in the dataset
+             fixed_categories = [
+                 "Art", "Comics", "Crafts", "Dance", "Design", "Fashion",
+                 "Film & Video", "Food", "Games", "Journalism", "Music",
+                 "Photography", "Publishing", "Technology", "Theater"
+             ]
+
+             category = campaign.get('raw_category', '')
+             # Create a one-hot encoding over the 15 fixed categories
+             encoding = [1 if cat == category else 0 for cat in fixed_categories]
+             return encoding
+         except Exception as e:
+             print(f"Error processing category: {str(e)}")
+             return [0] * 15
+
+     def process_subcategory_embedding(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         try:
+             subcategory = campaign.get('raw_subcategory', '')
+             return self._get_glove_embedding(subcategory)
+         except Exception as e:
+             print(f"Error processing subcategory: {str(e)}")
+             return np.zeros(100)
+
+     def process_country_embedding(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         try:
+             country = campaign.get('raw_country', '')
+             return self._get_glove_embedding(country)
+         except Exception as e:
+             print(f"Error processing country: {str(e)}")
+             return np.zeros(100)
+
+     def process_funding_goal(self, campaign: Dict, idx: int):
+         return float(campaign.get('funding_goal', 0))
+
+     def process_previous_funding_goal(self, campaign: Dict, idx: int):
+         return float(campaign.get('previous_funding_goal', 0))
+
+     def process_previous_pledged(self, campaign: Dict, idx: int):
+         return float(campaign.get('previous_pledged', 0))
+
+     def calculate_previous_success_rate(self, campaign: Dict, idx: int):
+         return float(campaign.get('previous_success_rate', 0))
+
+     def process_campaign(self, campaign: Dict, idx: int):
+         self._ensure_models_loaded()
+
+         # Generate embeddings for text fields
+         description_embedding, calculated_description_length = self.process_description_embedding(campaign, idx)
+
+         # Use the existing description_length value if present, otherwise the calculated one
+         description_length = campaign.get('description_length', calculated_description_length)
+
+         # Create the processed data dictionary with embeddings and numerical features
+         result = {
+             'description_embedding': description_embedding.tolist(),
+             'description_length': description_length,
+             'blurb_embedding': self.process_blurb(campaign, idx).tolist(),
+             'risk_embedding': self.process_riskandchallenges_embedding(campaign, idx).tolist(),
+             'category_embedding': self.process_category(campaign),
+             'subcategory_embedding': self.process_subcategory_embedding(campaign, idx).tolist(),
+             'country_embedding': self.process_country_embedding(campaign, idx).tolist()
+         }
+
+         # Numerical features: use values from the input when present, otherwise compute them
+         numerical_fields = [
+             ('funding_goal', self.process_funding_goal),
+             ('previous_funding_goal', self.process_previous_funding_goal),
+             ('previous_pledged', self.process_previous_pledged),
+             ('previous_success_rate', self.calculate_previous_success_rate)
+         ]
+
+         for field_name, processor_func in numerical_fields:
+             if field_name in campaign:
+                 result[field_name] = campaign[field_name]
+             else:
+                 result[field_name] = processor_func(campaign, idx)
+
+         # Simple integer fields
+         for field in ['image_count', 'video_count', 'campaign_duration', 'previous_projects_count']:
+             result[field] = int(campaign.get(field, 0))
+
+         return result
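+
+ if __name__ == "__main__":
+     # Illustrative sketch (not part of the serving path): process one raw
+     # campaign dict end to end. The first run downloads the Longformer,
+     # MiniLM, and GloVe models, so it can be slow.
+     processor = CampaignProcessor(data=[], lazy_load=True)
+     processed = processor.process_campaign({
+         "raw_description": "A sample project description.",
+         "raw_blurb": "A sample blurb.",
+         "raw_risks": "Sample risks and challenges.",
+         "raw_category": "Technology",
+         "raw_subcategory": "Gadgets",
+         "raw_country": "Canada",
+         "funding_goal": 1000,
+     }, idx=0)
+     print(len(processed["description_embedding"]))  # 768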
src/__pycache__/ProcessOneSingleCampaign.cpython-311.pyc ADDED
Binary file (14.4 kB).
src/__pycache__/explainer.cpython-310.pyc ADDED
Binary file (3.11 kB).
src/__pycache__/explainer.cpython-311.pyc ADDED
Binary file (5.88 kB).
src/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.3 kB).
src/__pycache__/model.cpython-311.pyc ADDED
Binary file (6.39 kB).
src/explainer.py ADDED
@@ -0,0 +1,108 @@
+ import torch
+ import numpy as np
+ from typing import Dict, Tuple
+
+ from src.model import KickstarterModel
+
+ class KickstarterExplainer:
+     """Kickstarter prediction model explainer"""
+
+     def __init__(self, model: KickstarterModel, device: torch.device = None):
+         """
+         Initialize the explainer.
+
+         Args:
+             model: Trained model.
+             device: Computation device.
+         """
+         self.model = model
+         self.device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+         self.model.eval()
+
+         # Numerical feature names
+         self.numerical_feature_names = [
+             'description_length',
+             'funding_goal',
+             'image_count',
+             'video_count',
+             'campaign_duration',
+             'previous_projects_count',
+             'previous_success_rate',
+             'previous_pledged',
+             'previous_funding_goal'
+         ]
+
+         # Mapping from embedding feature names to internal names
+         self.embedding_map = {
+             'description_embedding': 'description_embedding',
+             'blurb_embedding': 'blurb_embedding',
+             'risk_embedding': 'risk_embedding',
+             'subcategory_embedding': 'subcategory_embedding',
+             'category_embedding': 'category_embedding',
+             'country_embedding': 'country_embedding'
+         }
+
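+     # Attribution scheme: each feature's score is model(only this feature,
+     # all others zeroed) minus model(all-zero baseline). This is a one-pass,
+     # SHAP-style approximation rather than exact Shapley values, which would
+     # average over all feature coalitions.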
+     def _compute_feature_contribution(self, baseline_probs, inputs, feature_name, is_numerical=False, index=None):
+         # Create an input containing only the current feature
+         feature_input = {k: torch.zeros_like(v) for k, v in inputs.items()}
+
+         if is_numerical:
+             feature_input['numerical_features'] = torch.zeros_like(inputs['numerical_features'])
+             feature_input['numerical_features'][:, index] = inputs['numerical_features'][:, index]
+         else:
+             feature_input[feature_name] = inputs[feature_name]
+
+         # Prediction
+         with torch.no_grad():
+             feature_probs, _ = self.model(feature_input)
+
+         # The attribution is the prediction difference against the baseline
+         return (feature_probs - baseline_probs).cpu().item()
+
+     def explain_prediction(self, inputs: Dict[str, torch.Tensor]) -> Tuple[float, Dict[str, float]]:
+         """
+         Explain a single prediction.
+
+         Args:
+             inputs: Input features.
+
+         Returns:
+             Predicted probability and SHAP contribution values.
+         """
+         # Move inputs to the device
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         # Prediction
+         with torch.no_grad():
+             probs, _ = self.model(inputs)
+
+         # Calculate SHAP values
+         shap_values = {}
+         baseline = {k: torch.zeros_like(v) for k, v in inputs.items()}
+
+         # Predict the baseline
+         with torch.no_grad():
+             baseline_probs, _ = self.model(baseline)
+
+         # Calculate SHAP values for embedding features
+         for feature_name, embedding_name in self.embedding_map.items():
+             if embedding_name in inputs:
+                 shap_values[feature_name] = self._compute_feature_contribution(
+                     baseline_probs, inputs, embedding_name
+                 )
+
+         # Calculate SHAP values for numerical features
+         if 'numerical_features' in inputs:
+             num_features = inputs['numerical_features'].size(1)
+             for i in range(num_features):
+                 feature_name = self.numerical_feature_names[i]
+                 shap_values[feature_name] = self._compute_feature_contribution(
+                     baseline_probs, inputs, 'numerical_features',
+                     is_numerical=True, index=i
+                 )
+
+         # Return the prediction probability and SHAP values
+         prediction = probs.cpu().item()
+
+         return prediction, shap_values
src/model.py ADDED
@@ -0,0 +1,148 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Dict, Tuple
+
+ class KickstarterModel(nn.Module):
+     """Kickstarter project success prediction model"""
+
+     def __init__(
+         self,
+         desc_embedding_dim=768,
+         blurb_embedding_dim=384,
+         risk_embedding_dim=384,
+         subcategory_embedding_dim=100,
+         category_embedding_dim=15,
+         country_embedding_dim=100,
+         numerical_features_dim=9,
+         hidden_dim=512,
+         dropout_rate=0.3
+     ):
+         """
+         Initialize the model.
+
+         Args:
+             desc_embedding_dim: Description embedding vector dimension
+             blurb_embedding_dim: Blurb embedding vector dimension
+             risk_embedding_dim: Risk embedding vector dimension
+             subcategory_embedding_dim: Subcategory embedding vector dimension
+             category_embedding_dim: Category embedding vector dimension
+             country_embedding_dim: Country embedding vector dimension
+             numerical_features_dim: Numerical features dimension
+             hidden_dim: Hidden layer dimension
+             dropout_rate: Dropout rate
+         """
+         super(KickstarterModel, self).__init__()
+
+         # Helper function to create feature processing layers
+         def create_fc_block(input_dim, output_dim):
+             return nn.Sequential(
+                 nn.Linear(input_dim, output_dim),
+                 nn.BatchNorm1d(output_dim),
+                 nn.ReLU(),
+                 nn.Dropout(dropout_rate)
+             )
+
+         # Feature processing layers
+         self.desc_fc = create_fc_block(desc_embedding_dim, hidden_dim)
+         self.blurb_fc = create_fc_block(blurb_embedding_dim, hidden_dim // 2)
+         self.risk_fc = create_fc_block(risk_embedding_dim, hidden_dim // 2)
+         self.subcategory_fc = create_fc_block(subcategory_embedding_dim, hidden_dim // 4)
+         self.category_fc = create_fc_block(category_embedding_dim, hidden_dim // 8)
+         self.country_fc = create_fc_block(country_embedding_dim, hidden_dim // 8)
+         self.numerical_fc = create_fc_block(numerical_features_dim, hidden_dim // 4)
+
+         # Combined features dimension
+         concat_dim = (hidden_dim +
+                       hidden_dim // 2 +
+                       hidden_dim // 2 +
+                       hidden_dim // 4 +
+                       hidden_dim // 8 +
+                       hidden_dim // 8 +
+                       hidden_dim // 4)
+
+         # Fully connected layers
+         self.fc1 = create_fc_block(concat_dim, hidden_dim)
+         self.fc2 = create_fc_block(hidden_dim, hidden_dim // 2)
+
+         # Output layer
+         self.output = nn.Linear(hidden_dim // 2, 1)
+
+         # Input names for SHAP explanation
+         self.input_names = [
+             'description_embedding',
+             'blurb_embedding',
+             'risk_embedding',
+             'subcategory_embedding',
+             'category_embedding',
+             'country_embedding',
+             'numerical_features'
+         ]
+
+     def forward(self, inputs: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+         """
+         Forward propagation.
+
+         Args:
+             inputs: Dictionary containing all input features
+
+         Returns:
+             Prediction probability and intermediate feature representations
+         """
+         # Process embeddings
+         desc_out = self.desc_fc(inputs['description_embedding'])
+         blurb_out = self.blurb_fc(inputs['blurb_embedding'])
+         risk_out = self.risk_fc(inputs['risk_embedding'])
+         subcategory_out = self.subcategory_fc(inputs['subcategory_embedding'])
+         category_out = self.category_fc(inputs['category_embedding'])
+         country_out = self.country_fc(inputs['country_embedding'])
+         numerical_out = self.numerical_fc(inputs['numerical_features'])
+
+         # Concatenate all features
+         combined = torch.cat([
+             desc_out,
+             blurb_out,
+             risk_out,
+             subcategory_out,
+             category_out,
+             country_out,
+             numerical_out
+         ], dim=1)
+
+         # Fully connected layers
+         x = self.fc1(combined)
+         x = self.fc2(x)
+
+         # Output layer
+         logits = self.output(x)
+         probs = torch.sigmoid(logits)
+
+         # Store intermediate features for SHAP explanation
+         intermediate_features = {
+             'description_embedding': desc_out,
+             'blurb_embedding': blurb_out,
+             'risk_embedding': risk_out,
+             'subcategory_embedding': subcategory_out,
+             'category_embedding': category_out,
+             'country_embedding': country_out,
+             'numerical_features': numerical_out,
+             'combined': combined,
+             'fc1': x
+         }
+
+         return probs.squeeze(1), intermediate_features
+
+     def predict(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
+         """
+         Prediction function.
+
+         Args:
+             inputs: Dictionary containing all input features
+
+         Returns:
+             Prediction probability
+         """
+         self.eval()
+         with torch.no_grad():
+             probs, _ = self.forward(inputs)
+         return probs
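+
+ if __name__ == "__main__":
+     # Illustrative smoke test (not part of the training pipeline): run a
+     # forward pass on a batch of two random inputs with the documented
+     # dimensions, using the same hidden_dim that app.py loads.
+     example_inputs = {
+         'description_embedding': torch.randn(2, 768),
+         'blurb_embedding': torch.randn(2, 384),
+         'risk_embedding': torch.randn(2, 384),
+         'subcategory_embedding': torch.randn(2, 100),
+         'category_embedding': torch.randn(2, 15),
+         'country_embedding': torch.randn(2, 100),
+         'numerical_features': torch.randn(2, 9),
+     }
+     probs = KickstarterModel(hidden_dim=256).predict(example_inputs)
+     print(probs.shape)  # torch.Size([2])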
test.py ADDED
@@ -0,0 +1,183 @@
+ import requests
+ import json
+ import pprint
+ import time
+ import sys
+ import os
+ import numpy as np
+
+ def check_internet_connectivity():
+     """Check if we can connect to the internet"""
+     print("Testing internet connectivity...")
+     try:
+         response = requests.get("https://huggingface.co", timeout=5)
+         print(f"Connection to huggingface.co: Status {response.status_code}")
+         return response.status_code == 200
+     except Exception as e:
+         print(f"Error connecting to huggingface.co: {str(e)}")
+         return False
+
+ def check_model_repository():
+     """Check if we can connect to the specific model repository"""
+     print("Testing connection to model repository...")
+     try:
+         # Try to access the model repository
+         url = "https://huggingface.co/allenai/longformer-base-4096"
+         response = requests.get(url, timeout=5)
+         print(f"Connection to model repository: Status {response.status_code}")
+         return response.status_code == 200
+     except Exception as e:
+         print(f"Error connecting to model repository: {str(e)}")
+         return False
+
+ def check_debug_endpoint(api_url):
+     """Check the debug endpoint for diagnostic information"""
+     print(f"Checking debug endpoint at {api_url.replace('/predict', '/debug')}...")
+     try:
+         response = requests.get(api_url.replace("/predict", "/debug"), timeout=10)
+         if response.status_code == 200:
+             debug_info = response.json()
+             print("Debug information retrieved:")
+             print(f"- API Status: {debug_info.get('api_status', 'Unknown')}")
+             print(f"- Model Loaded: {debug_info.get('model_loaded', 'Unknown')}")
+             print(f"- Cache Directory Exists: {debug_info.get('model_cache_exists', 'Unknown')}")
+             print(f"- Temp Directory Writable: {debug_info.get('tmp_directory_writable', 'Unknown')}")
+
+             # Check internet connectivity from the server
+             internet_check = debug_info.get('internet_connectivity', {})
+             print(f"- Server Internet Connectivity: {internet_check.get('status', 'Unknown')}")
+             if internet_check.get('message'):
+                 print(f"  Message: {internet_check.get('message')}")
+
+             # Check the tokenizer test
+             tokenizer_test = debug_info.get('tokenizer_test', {})
+             print(f"- Tokenizer Test: {tokenizer_test.get('status', 'Unknown')}")
+             if tokenizer_test.get('message'):
+                 print(f"  Message: {tokenizer_test.get('message')}")
+
+             # Check disk space
+             disk_space = debug_info.get('disk_space', {})
+             if disk_space.get('status') == 'ok':
+                 print(f"- Disk Space: Total: {disk_space.get('total_gb', 0):.2f} GB, Used: {disk_space.get('used_gb', 0):.2f} GB, Free: {disk_space.get('free_gb', 0):.2f} GB ({disk_space.get('percent_used', 0):.1f}% used)")
+
+             return debug_info
+         else:
+             print(f"Error accessing debug endpoint: Status {response.status_code}")
+             print(response.text)
+             return None
+     except Exception as e:
+         print(f"Exception when accessing debug endpoint: {str(e)}")
+         return None
+
+ # API endpoint on Hugging Face Spaces
+ API_URL = "https://angusfung-kickstarter-success-prediction.hf.space/predict"
+
+ # Sample input data (similar to what would be in input.json)
+ campaign_data = {
+     "raw_description": "Introducing the AquaGo: The Smart, Eco-Friendly Portable Water Purifier! Clean water is a basic human right — yet for millions around the world, it's a daily struggle. Whether you're an outdoor adventurer, traveling to remote areas, or preparing for emergencies, access to safe drinking water should never be a compromise. That's why we created **AquaGo**, a revolutionary portable water purifier that combines cutting-edge filtration technology, smart sensors, and sustainable materials — all packed into a sleek, lightweight design you can take anywhere.",
+     "raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier that delivers clean, safe drinking water anywhere.",
+     "raw_risks": "Bringing a product to market involves complex engineering, regulatory approvals, and safety testing. Delays may occur due to certification or supply chain issues.",
+     "raw_subcategory": "Gadgets",
+     "raw_category": "Technology",
+     "raw_country": "Canada",
+     "funding_goal": 2000,
+     "image_count": 8,
+     "video_count": 3,
+     "campaign_duration": 90,
+     "previous_projects_count": 5,
+     "previous_success_rate": 0.4,
+     "previous_pledged": 18745.33,
+     "previous_funding_goal": 23564.99
+ }
+
+ def predict_success(data, max_retries=3, retry_delay=10):
+     """Send data to the API and get prediction results with retries"""
+     for attempt in range(max_retries):
+         try:
+             # Make the POST request to the API
+             print(f"Sending request to: {API_URL} (Attempt {attempt + 1}/{max_retries})")
+             response = requests.post(API_URL, json=data, timeout=60)
+
+             # Check if the request was successful
+             if response.status_code == 200:
+                 return response.json()
+             else:
+                 print(f"Error: {response.status_code}")
+                 print(response.text)
+
+                 if response.status_code == 500 and "Can't load tokenizer" in response.text:
+                     print(f"The model might be downloading. Waiting {retry_delay} seconds before retry...")
+                     time.sleep(retry_delay)
+                 else:
+                     # For other errors, don't retry
+                     return None
+
+         except Exception as e:
+             print(f"Exception occurred: {str(e)}")
+             print(f"Waiting {retry_delay} seconds before retry...")
+             time.sleep(retry_delay)
+
+     return None
+
+ def display_results(results):
+     """Display the prediction results in a user-friendly way"""
+     if not results:
+         print("No results to display.")
+         return
+
+     print("\n===== KICKSTARTER SUCCESS PREDICTION =====\n")
+     print(f"Success Probability: {results['success_probability']:.2%}")
+     print(f"Predicted Outcome: {results['predicted_outcome']}")
+
+     print("\n----- TOP INFLUENCING FACTORS -----")
+     # Get the top 5 factors by absolute magnitude
+     top_factors = sorted(
+         results['shap_values'].items(),
+         key=lambda x: abs(float(x[1])),
+         reverse=True
+     )[:5]
+
+     for factor, value in top_factors:
+         impact = "POSITIVE" if float(value) > 0 else "NEGATIVE"
+         print(f"{factor}: {value:.4f} ({impact})")
+
+     print("\n----- ALL SHAP VALUES -----")
+     pp = pprint.PrettyPrinter(indent=2)
+     pp.pprint(results['shap_values'])
+
+     # Display Longformer embedding information if available
+     if 'longformer_embedding' in results:
+         embedding = np.array(results['longformer_embedding'])
+         print("\n----- LONGFORMER EMBEDDING -----")
+         print(f"Embedding Shape: {embedding.shape}")
+         print(f"First 10 values: {embedding[:10]}")
+
+         # Calculate some basic statistics on the embedding
+         try:
+             print(f"Mean: {np.mean(embedding):.4f}")
+             print(f"Std: {np.std(embedding):.4f}")
+             print(f"Min: {np.min(embedding):.4f}")
+             print(f"Max: {np.max(embedding):.4f}")
+         except Exception as e:
+             print(f"Error calculating embedding statistics: {str(e)}")
+
+ # Main execution
+ if __name__ == "__main__":
+     print("==== DIAGNOSTICS ====")
+     print("Testing connectivity from client machine...")
+     internet_ok = check_internet_connectivity()
+     repo_ok = check_model_repository()
+
+     debug_info = check_debug_endpoint(API_URL)
+
+     print("\n==== PREDICTION TEST ====")
+     if not internet_ok:
+         print("WARNING: Internet connectivity issues detected on client machine.")
+
+     if not repo_ok:
+         print("WARNING: Cannot access model repository from client machine.")
+
+     print("Sending prediction request...")
+     results = predict_success(campaign_data, max_retries=2, retry_delay=10)
+     display_results(results)
+ display_results(results)