Update src/ProcessOneSingleCampaign.py
Browse files- src/ProcessOneSingleCampaign.py +131 -4
src/ProcessOneSingleCampaign.py
CHANGED
|
@@ -53,6 +53,88 @@ class CampaignProcessor:
|
|
| 53 |
self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
|
| 54 |
self.lazy_load = lazy_load
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# Initialize model variables (to be loaded later)
|
| 57 |
self.tokenizer = None # Longformer tokenizer for descriptions
|
| 58 |
self.model = None # Longformer model for descriptions
|
|
@@ -365,24 +447,69 @@ class CampaignProcessor:
|
|
| 365 |
print(f"Error processing subcategory: {str(e)}")
|
| 366 |
return np.zeros(100)
|
| 367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
|
| 369 |
"""
|
| 370 |
Process the project country to generate a GloVe embedding.
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
Args:
|
| 373 |
campaign (Dict): Campaign data
|
| 374 |
idx (int): Index of the campaign
|
| 375 |
|
| 376 |
Returns:
|
| 377 |
-
np.ndarray: GloVe embedding of the country
|
| 378 |
"""
|
| 379 |
self._ensure_models_loaded()
|
| 380 |
|
| 381 |
try:
|
| 382 |
-
country
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
except Exception as e:
|
| 385 |
-
print(f"Error processing country: {str(e)}")
|
| 386 |
return np.zeros(100)
|
| 387 |
|
| 388 |
def process_funding_goal(self, campaign: Dict, idx: int) -> float:
|
|
|
|
| 53 |
self.categories = sorted(list(set(camp.get('raw_category', '') for camp in self.data)))
|
| 54 |
self.lazy_load = lazy_load
|
| 55 |
|
| 56 |
+
# Country name to ISO alpha-2 code mapping
|
| 57 |
+
# First letter of each word is capitalized in the original data
|
| 58 |
+
self.country_to_alpha2 = {
|
| 59 |
+
# Main ISO country names
|
| 60 |
+
"United States": "US",
|
| 61 |
+
"United Kingdom": "GB",
|
| 62 |
+
"Canada": "CA",
|
| 63 |
+
"Australia": "AU",
|
| 64 |
+
"New Zealand": "NZ",
|
| 65 |
+
"Germany": "DE",
|
| 66 |
+
"France": "FR",
|
| 67 |
+
"Italy": "IT",
|
| 68 |
+
"Spain": "ES",
|
| 69 |
+
"Netherlands": "NL",
|
| 70 |
+
"Sweden": "SE",
|
| 71 |
+
"Denmark": "DK",
|
| 72 |
+
"Norway": "NO",
|
| 73 |
+
"Ireland": "IE",
|
| 74 |
+
"Switzerland": "CH",
|
| 75 |
+
"Austria": "AT",
|
| 76 |
+
"Belgium": "BE",
|
| 77 |
+
"Luxembourg": "LU",
|
| 78 |
+
"Hong Kong": "HK",
|
| 79 |
+
"Singapore": "SG",
|
| 80 |
+
"Mexico": "MX",
|
| 81 |
+
"Japan": "JP",
|
| 82 |
+
"China": "CN",
|
| 83 |
+
"Brazil": "BR",
|
| 84 |
+
"India": "IN",
|
| 85 |
+
"South Korea": "KR",
|
| 86 |
+
"South Africa": "ZA",
|
| 87 |
+
"Argentina": "AR",
|
| 88 |
+
"Poland": "PL",
|
| 89 |
+
"Portugal": "PT",
|
| 90 |
+
"Russia": "RU",
|
| 91 |
+
"Greece": "GR",
|
| 92 |
+
"Czech Republic": "CZ",
|
| 93 |
+
"Finland": "FI",
|
| 94 |
+
"Hungary": "HU",
|
| 95 |
+
"Romania": "RO",
|
| 96 |
+
"Thailand": "TH",
|
| 97 |
+
"Turkey": "TR",
|
| 98 |
+
"Ukraine": "UA",
|
| 99 |
+
"Colombia": "CO",
|
| 100 |
+
"Chile": "CL",
|
| 101 |
+
"Peru": "PE",
|
| 102 |
+
"Malaysia": "MY",
|
| 103 |
+
"Vietnam": "VN",
|
| 104 |
+
"Indonesia": "ID",
|
| 105 |
+
"Philippines": "PH",
|
| 106 |
+
"United Arab Emirates": "AE",
|
| 107 |
+
"Saudi Arabia": "SA",
|
| 108 |
+
"Israel": "IL",
|
| 109 |
+
"Egypt": "EG",
|
| 110 |
+
"Nigeria": "NG",
|
| 111 |
+
"Kenya": "KE",
|
| 112 |
+
|
| 113 |
+
# Common variants and abbreviations
|
| 114 |
+
"USA": "US",
|
| 115 |
+
"U.S.A.": "US",
|
| 116 |
+
"U.S.": "US",
|
| 117 |
+
"UK": "GB",
|
| 118 |
+
"U.K.": "GB",
|
| 119 |
+
"Great Britain": "GB",
|
| 120 |
+
"England": "GB",
|
| 121 |
+
"Republic Of Korea": "KR",
|
| 122 |
+
"Korea": "KR",
|
| 123 |
+
"Republic Of China": "CN",
|
| 124 |
+
"Republic Of India": "IN",
|
| 125 |
+
"UAE": "AE",
|
| 126 |
+
"Russia": "RU",
|
| 127 |
+
"Russian Federation": "RU",
|
| 128 |
+
"The Netherlands": "NL",
|
| 129 |
+
"Holland": "NL",
|
| 130 |
+
"Republic Of Ireland": "IE",
|
| 131 |
+
"Czech": "CZ",
|
| 132 |
+
"Czechia": "CZ",
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
# Create a lowercase version of the dictionary for case-insensitive lookups
|
| 136 |
+
self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
|
| 137 |
+
|
| 138 |
# Initialize model variables (to be loaded later)
|
| 139 |
self.tokenizer = None # Longformer tokenizer for descriptions
|
| 140 |
self.model = None # Longformer model for descriptions
|
|
|
|
| 447 |
print(f"Error processing subcategory: {str(e)}")
|
| 448 |
return np.zeros(100)
|
| 449 |
|
| 450 |
+
def _convert_country_to_alpha2(self, country_name: str) -> str:
    """
    Convert a country name to its ISO alpha-2 code.

    Lookup order:
    1. Exact match in ``self.country_to_alpha2``
    2. Case-insensitive match in ``self.country_to_alpha2_lower``
    3. Returns the original string if no match is found

    Leading and trailing whitespace is ignored for the lookup, so
    " United States " resolves the same way as "United States".
    The outcome of every lookup is printed.

    Args:
        country_name (str): Country name to convert. Non-string values
            (None, NaN, numbers) are treated as a missing country.

    Returns:
        str: ISO alpha-2 code (e.g., "US"), the original country name
            unchanged if no match, or "" when the input is missing,
            not a string, or only whitespace
    """
    # A non-string value has no .strip()/.lower(); treat it like a missing
    # country instead of raising AttributeError.
    if not isinstance(country_name, str):
        return ""

    lookup_key = country_name.strip()
    if not lookup_key:
        return ""

    # Try exact match first
    alpha2_code = self.country_to_alpha2.get(lookup_key)

    # If no exact match, try case-insensitive match
    if not alpha2_code:
        alpha2_code = self.country_to_alpha2_lower.get(lookup_key.lower())

    # Log results
    if alpha2_code:
        print(f"Country conversion: '{country_name}' → '{alpha2_code}'")
        return alpha2_code

    print(f"Country conversion failed: '{country_name}' not found in dictionary")
    # Hand the caller's string back untouched so downstream code can still
    # work with the raw name.
    return country_name
|
| 482 |
+
|
| 483 |
def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
    """
    Build the embedding vector for a campaign's country.

    Steps:
    1. Read the 'raw_country' entry from the campaign (empty string if absent)
    2. Normalize it with ``_convert_country_to_alpha2`` (e.g., "United States" → "US")
    3. Pass the normalized value to ``_get_glove_embedding``

    ``_ensure_models_loaded`` is called before the guarded section, so an
    error raised there propagates to the caller.

    Args:
        campaign (Dict): Campaign data
        idx (int): Index of the campaign (used only in the error message)

    Returns:
        np.ndarray: Result of ``_get_glove_embedding`` for the normalized
            country, or a zero vector of length 100 if any step raises
    """
    self._ensure_models_loaded()

    try:
        raw_country = campaign.get('raw_country', '')
        normalized = self._convert_country_to_alpha2(raw_country)
        embedding = self._get_glove_embedding(normalized)
    except Exception as e:
        # Best effort: report the failure and fall back to an all-zero vector
        print(f"Error processing country for campaign {idx}: {str(e)}")
        embedding = np.zeros(100)

    return embedding
|
| 514 |
|
| 515 |
def process_funding_goal(self, campaign: Dict, idx: int) -> float:
|