Spaces:

angusfung
/

Kickstarter-prediction-embedding

Sleeping

App Files Files Community

angusfung committed on Apr 22

Commit

c732f23

verified ·

1 Parent(s): 7e4ff82

Update src/ProcessOneSingleCampaign.py

Browse files

Files changed (1) hide show

src/ProcessOneSingleCampaign.py +131 -4

src/ProcessOneSingleCampaign.py CHANGED Viewed

@@ -53,6 +53,88 @@ class CampaignProcessor:
         self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
         self.lazy_load = lazy_load
         # Initialize model variables (to be loaded later)
         self.tokenizer = None  # Longformer tokenizer for descriptions
         self.model = None  # Longformer model for descriptions
@@ -365,24 +447,69 @@ class CampaignProcessor:
             print(f"Error processing subcategory: {str(e)}")
             return np.zeros(100)
     def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
         """
         Process the project country to generate a GloVe embedding.
         Args:
             campaign (Dict): Campaign data
             idx (int): Index of the campaign
         Returns:
-            np.ndarray: GloVe embedding of the country
         """
         self._ensure_models_loaded()
         try:
-            country = campaign.get('raw_country', '')
-            return self._get_glove_embedding(country)
         except Exception as e:
-            print(f"Error processing country: {str(e)}")
             return np.zeros(100)
     def process_funding_goal(self, campaign: Dict, idx: int) -> float:

         self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
         self.lazy_load = lazy_load
+        # Country name to ISO alpha-2 code mapping
+        # First letter of each word is capitalized in the original data
+        self.country_to_alpha2 = {
+            # Main ISO country names
+            "United States": "US",
+            "United Kingdom": "GB",
+            "Canada": "CA",
+            "Australia": "AU",
+            "New Zealand": "NZ",
+            "Germany": "DE",
+            "France": "FR",
+            "Italy": "IT",
+            "Spain": "ES",
+            "Netherlands": "NL",
+            "Sweden": "SE",
+            "Denmark": "DK",
+            "Norway": "NO",
+            "Ireland": "IE",
+            "Switzerland": "CH",
+            "Austria": "AT",
+            "Belgium": "BE",
+            "Luxembourg": "LU",
+            "Hong Kong": "HK",
+            "Singapore": "SG",
+            "Mexico": "MX",
+            "Japan": "JP",
+            "China": "CN",
+            "Brazil": "BR",
+            "India": "IN",
+            "South Korea": "KR",
+            "South Africa": "ZA",
+            "Argentina": "AR",
+            "Poland": "PL",
+            "Portugal": "PT",
+            "Russia": "RU",
+            "Greece": "GR",
+            "Czech Republic": "CZ",
+            "Finland": "FI",
+            "Hungary": "HU",
+            "Romania": "RO",
+            "Thailand": "TH",
+            "Turkey": "TR",
+            "Ukraine": "UA",
+            "Colombia": "CO",
+            "Chile": "CL",
+            "Peru": "PE",
+            "Malaysia": "MY",
+            "Vietnam": "VN",
+            "Indonesia": "ID",
+            "Philippines": "PH",
+            "United Arab Emirates": "AE",
+            "Saudi Arabia": "SA",
+            "Israel": "IL",
+            "Egypt": "EG",
+            "Nigeria": "NG",
+            "Kenya": "KE",
+            # Common variants and abbreviations
+            "USA": "US",
+            "U.S.A.": "US",
+            "U.S.": "US",
+            "UK": "GB",
+            "U.K.": "GB",
+            "Great Britain": "GB",
+            "England": "GB",
+            "Republic Of Korea": "KR",
+            "Korea": "KR",
+            "Republic Of China": "CN",
+            "Republic Of India": "IN",
+            "UAE": "AE",
+            "Russia": "RU",
+            "Russian Federation": "RU",
+            "The Netherlands": "NL",
+            "Holland": "NL",
+            "Republic Of Ireland": "IE",
+            "Czech": "CZ",
+            "Czechia": "CZ",
+        }
+        # Create a lowercase version of the dictionary for case-insensitive lookups
+        self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
         # Initialize model variables (to be loaded later)
         self.tokenizer = None  # Longformer tokenizer for descriptions
         self.model = None  # Longformer model for descriptions
             print(f"Error processing subcategory: {str(e)}")
             return np.zeros(100)
+    def _convert_country_to_alpha2(self, country_name: str) -> str:
+        """
+        Convert a country name to its ISO alpha-2 code.
+        Lookup order:
+        1. Exact match against self.country_to_alpha2
+        2. Case-insensitive match against self.country_to_alpha2_lower
+        3. No match: the input string is returned unchanged
+        The outcome of every non-empty lookup is reported with print().
+        The name is not stripped of surrounding whitespace before lookup, so a
+        padded value such as " Canada" falls through to step 3.
+        Args:
+            country_name (str): Country name to convert
+        Returns:
+            str: ISO alpha-2 code (e.g., "US"), the original country name if no
+                match is found, or "" when country_name is empty/falsy
+        """
+        # Empty or missing name: nothing to look up and nothing is printed
+        if not country_name:
+            return ""
+        # Try exact match first
+        alpha2_code = self.country_to_alpha2.get(country_name)
+        # If no exact match, retry with the lowercased name against the lowercased keys
+        if not alpha2_code:
+            alpha2_code = self.country_to_alpha2_lower.get(country_name.lower())
+        # Report the outcome on stdout and return the code, or the input as fallback
+        if alpha2_code:
+            print(f"Country conversion: '{country_name}' → '{alpha2_code}'")
+            return alpha2_code
+        else:
+            print(f"Country conversion failed: '{country_name}' not found in dictionary")
+            return country_name
     def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
         """
         Process the project country to generate a GloVe embedding.
+        This method:
+        1. Reads the country name from the campaign's 'raw_country' key ('' if absent)
+        2. Converts full country name to ISO alpha-2 code (e.g., "United States" → "US");
+           names the converter cannot map are passed through unchanged
+        3. Passes the resulting string to self._get_glove_embedding
         Args:
             campaign (Dict): Campaign data
             idx (int): Index of the campaign
+                (used here only in the error message)
         Returns:
+            np.ndarray: GloVe embedding of the country (as alpha-2 code), or
+                np.zeros(100) if any exception is raised while processing
         """
         self._ensure_models_loaded()
         try:
+            # Extract country name from campaign data
+            country_name = campaign.get('raw_country', '')
+            # Convert to alpha-2 code using helper method (unmapped names come back as-is)
+            alpha2_code = self._convert_country_to_alpha2(country_name)
+            # Generate embedding from the converted string
+            return self._get_glove_embedding(alpha2_code)
         except Exception as e:
+            # Best-effort: report the failure and fall back to a zero vector
+            print(f"Error processing country for campaign {idx}: {str(e)}")
             return np.zeros(100)
     def process_funding_goal(self, campaign: Dict, idx: int) -> float: