Bohaska commited on
Commit
7392937
·
1 Parent(s): a8087d6

update GA resolution scripts to use API

Browse files
ns_ga_resolutions_loose_bge-m3.npy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55e9bf59aa6262ef5d81918d14cefca4667dfc5d41847670c7118622e832c275
3
- size 3418831
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df930cf80b890d10c3f95f9a4fd520d16a7d3a76726bf871e394345cf6d6e11
3
+ size 3555265
ns_ga_resolutions_semantic_bge-m3.npy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a3e1e4def2ec2cd87a3c11fabb8af6e2457251c120619ffc29940c938c100e4
3
- size 1595520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76872864f28f8812bad14c5e1bcf48bf513dbd760bec937385bf8cfa4cc45de4
3
+ size 1642624
parsed_ga_resolutions.json CHANGED
The diff for this file is too large to render. See raw diff
 
small_scripts/ga_resolutions.json DELETED
The diff for this file is too large to render. See raw diff
 
small_scripts/make_embedding/embedding_ga_resolutions.py CHANGED
@@ -13,85 +13,135 @@ MODEL_PATH = '../../../../Downloads/bge-m3'
13
  # Path to the input JSON file for GA resolutions.
14
  GA_RESOLUTIONS_JSON_PATH = os.path.join(script_dir, '..', '..', 'parsed_ga_resolutions.json')
15
 
16
- # Output directory for the generated embedding files.
17
- # Assuming output files should go to the parent directory of this script.
18
  OUTPUT_DIR = os.path.join(script_dir, '..', '..')
19
 
 
 
 
 
 
 
20
  # --- Main Embedding Function ---
21
- def encode_ga_resolutions():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  print("Initializing BGEM3FlagModel...")
23
  try:
24
  model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
25
  print("Model loaded.")
26
  except Exception as e:
27
  print(f"Error loading model from {MODEL_PATH}: {e}")
28
- print("Please ensure the model is downloaded to the specified path.")
29
  return
30
 
31
- print(f"Loading GA resolutions from: {GA_RESOLUTIONS_JSON_PATH}")
32
  try:
33
- with open(GA_RESOLUTIONS_JSON_PATH, 'r', encoding='utf-8') as file:
34
- resolutions_data = json.load(file)
35
- except FileNotFoundError:
36
- print(f"Error: GA resolutions JSON file not found at {GA_RESOLUTIONS_JSON_PATH}")
37
- return
38
- except json.JSONDecodeError as e:
39
- print(f"Error decoding JSON from {GA_RESOLUTIONS_JSON_PATH}: {e}")
40
- return
41
  except Exception as e:
42
- print(f"An unexpected error occurred while loading {GA_RESOLUTIONS_JSON_PATH}: {e}")
43
- return
44
-
45
- # Extract the 'body' of each resolution to be encoded
46
- resolutions_text = [r['body'] for r in resolutions_data if 'body' in r and r['body'].strip()]
47
-
48
- if not resolutions_text:
49
- print("No valid resolution bodies found to encode. Exiting.")
50
  return
51
 
52
- print(f"Found {len(resolutions_text)} GA resolutions to encode.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- print("Encoding resolutions (dense, sparse)...") # <--- Updated print statement
55
  try:
56
- embeddings = model.encode(resolutions_text,
57
- batch_size=8, # Adjust batch_size based on your GPU/CPU memory
58
- max_length=8192, # Max length of input sequence
59
- return_dense=True,
60
- return_sparse=True, # This will return 'lexical_weights' for BGE-M3
61
- return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
62
-
63
  # Ensure output directory exists
64
  os.makedirs(OUTPUT_DIR, exist_ok=True)
65
 
66
- # --- Save Semantic (Dense) Embeddings ---
67
- dense_embeddings = embeddings['dense_vecs']
68
- dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_semantic_bge-m3.npy') # Renamed file
69
- np.save(dense_output_path, dense_embeddings)
70
- print(f"Saved semantic embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})") # Renamed type and file
71
-
72
- # --- Save Loose (Sparse) Embeddings ---
73
- # 'lexical_weights' is a list of dictionaries, one for each item in the batch
74
- sparse_list_of_dicts = embeddings['lexical_weights']
75
-
76
- # Save this list of sparse dictionaries as a NumPy object array
77
- sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_loose_bge-m3.npy') # Renamed file
78
- np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
79
- print(f"Saved loose embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})") # Renamed type and file
80
 
 
 
 
81
 
82
- # --- Removed ColBERT Embeddings Saving ---
83
- # colbert_list_of_arrays = embeddings['colbert_vecs']
84
- # colbert_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_colbert_bge-m3.npy')
85
- # np.save(colbert_output_path, np.array(colbert_list_of_arrays, dtype=object), allow_pickle=True)
86
- # print(f"Saved ColBERT embeddings to {colbert_output_path} (Total objects: {len(colbert_list_of_arrays)})")
87
 
88
- print("\nGA Resolution embedding generation complete!")
89
 
90
  except Exception as e:
91
- print(f"An error occurred during embedding generation: {e}")
92
- import traceback
93
- traceback.print_exc() # Print full traceback for debugging
94
 
95
  # Call the function to start the embedding process
96
  if __name__ == "__main__":
97
- encode_ga_resolutions()
 
13
  # Path to the input JSON file for GA resolutions.
14
  GA_RESOLUTIONS_JSON_PATH = os.path.join(script_dir, '..', '..', 'parsed_ga_resolutions.json')
15
 
16
+ # Output directory for the generated files.
 
17
  OUTPUT_DIR = os.path.join(script_dir, '..', '..')
18
 
19
+ # --- Output and Cache File Paths ---
20
+ DENSE_OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_semantic_bge-m3.npy')
21
+ SPARSE_OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_loose_bge-m3.npy')
22
+ MANIFEST_PATH = os.path.join(script_dir, 'embeddings_manifest.json') # New manifest file
23
+
24
+
25
  # --- Main Embedding Function ---
26
+ def encode_ga_resolutions_with_caching():
27
+ # 1. --- Load the source of truth: all resolutions ---
28
+ print(f"Loading all GA resolutions from: {GA_RESOLUTIONS_JSON_PATH}")
29
+ try:
30
+ with open(GA_RESOLUTIONS_JSON_PATH, 'r', encoding='utf-8') as file:
31
+ all_resolutions_data = json.load(file)
32
+ # Filter out resolutions without a valid body
33
+ all_resolutions_data = [
34
+ r for r in all_resolutions_data if 'id' in r and 'body' in r and r['body'].strip()
35
+ ]
36
+ except (FileNotFoundError, json.JSONDecodeError, Exception) as e:
37
+ print(f"Fatal Error: Could not load or parse the source resolutions file. Cannot proceed. Error: {e}")
38
+ return
39
+
40
+ # 2. --- Load existing cache (manifest and embeddings) ---
41
+ cached_manifest = {}
42
+ old_dense_embeddings = None
43
+ old_sparse_embeddings = None
44
+
45
+ if os.path.exists(MANIFEST_PATH) and os.path.exists(DENSE_OUTPUT_PATH) and os.path.exists(SPARSE_OUTPUT_PATH):
46
+ print("Found existing cache. Loading manifest and embeddings.")
47
+ try:
48
+ with open(MANIFEST_PATH, 'r', encoding='utf-8') as f:
49
+ cached_manifest = json.load(f)
50
+ # Convert string keys from JSON back to integers if necessary
51
+ cached_manifest = {int(k): v for k, v in cached_manifest.items()}
52
+
53
+ old_dense_embeddings = np.load(DENSE_OUTPUT_PATH)
54
+ old_sparse_embeddings = np.load(SPARSE_OUTPUT_PATH, allow_pickle=True)
55
+ print(f"Successfully loaded cache for {len(cached_manifest)} resolutions.")
56
+ except Exception as e:
57
+ print(f"Warning: Could not load cache files correctly: {e}. Re-embedding all resolutions.")
58
+ cached_manifest = {} # Reset if cache is corrupt
59
+ else:
60
+ print("No existing cache found. Will generate embeddings for all resolutions.")
61
+
62
+ # 3. --- Identify new resolutions to be encoded ---
63
+ all_res_ids = {r['id'] for r in all_resolutions_data}
64
+ cached_res_ids = set(cached_manifest.keys())
65
+ new_res_ids = all_res_ids - cached_res_ids
66
+
67
+ if not new_res_ids:
68
+ print("All resolutions are already embedded. Nothing to do. Exiting.")
69
+ return
70
+
71
+ print(f"Found {len(new_res_ids)} new resolutions to embed.")
72
+ resolutions_to_encode = [r for r in all_resolutions_data if r['id'] in new_res_ids]
73
+ # Sort by ID to ensure a consistent order
74
+ resolutions_to_encode.sort(key=lambda x: x['id'])
75
+
76
+ new_texts = [r['body'] for r in resolutions_to_encode]
77
+
78
+ # 4. --- Initialize model and encode ONLY the new data ---
79
  print("Initializing BGEM3FlagModel...")
80
  try:
81
  model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
82
  print("Model loaded.")
83
  except Exception as e:
84
  print(f"Error loading model from {MODEL_PATH}: {e}")
 
85
  return
86
 
87
+ print(f"Encoding {len(new_texts)} new resolutions (dense, sparse)...")
88
  try:
89
+ new_embeddings = model.encode(new_texts,
90
+ batch_size=8,
91
+ max_length=8192,
92
+ return_dense=True,
93
+ return_sparse=True,
94
+ return_colbert_vecs=False)
 
 
95
  except Exception as e:
96
+ print(f"An error occurred during embedding generation: {e}")
97
+ import traceback
98
+ traceback.print_exc()
 
 
 
 
 
99
  return
100
 
101
+ # 5. --- Combine old and new embeddings ---
102
+ new_dense_vecs = new_embeddings['dense_vecs']
103
+ new_sparse_list = new_embeddings['lexical_weights']
104
+ new_sparse_vecs = np.array(new_sparse_list, dtype=object)
105
+
106
+ if old_dense_embeddings is not None and old_sparse_embeddings is not None:
107
+ print("Combining new embeddings with cached ones...")
108
+ combined_dense_embeddings = np.vstack([old_dense_embeddings, new_dense_vecs])
109
+ combined_sparse_embeddings = np.concatenate([old_sparse_embeddings, new_sparse_vecs])
110
+ else:
111
+ # This branch is for the first run when no cache exists
112
+ combined_dense_embeddings = new_dense_vecs
113
+ combined_sparse_embeddings = new_sparse_vecs
114
+
115
+ # 6. --- Update manifest and save everything ---
116
+ print("Updating manifest file...")
117
+ start_index = len(cached_manifest)
118
+ updated_manifest = cached_manifest.copy()
119
+ for i, res in enumerate(resolutions_to_encode):
120
+ updated_manifest[res['id']] = start_index + i
121
 
 
122
  try:
 
 
 
 
 
 
 
123
  # Ensure output directory exists
124
  os.makedirs(OUTPUT_DIR, exist_ok=True)
125
 
126
+ # Save combined embeddings
127
+ np.save(DENSE_OUTPUT_PATH, combined_dense_embeddings)
128
+ print(f"Saved combined semantic embeddings to {DENSE_OUTPUT_PATH} (Shape: {combined_dense_embeddings.shape})")
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ np.save(SPARSE_OUTPUT_PATH, combined_sparse_embeddings, allow_pickle=True)
131
+ print(
132
+ f"Saved combined loose embeddings to {SPARSE_OUTPUT_PATH} (Total objects: {len(combined_sparse_embeddings)})")
133
 
134
+ # Save the updated manifest
135
+ with open(MANIFEST_PATH, 'w', encoding='utf-8') as f:
136
+ json.dump(updated_manifest, f, indent=2)
137
+ print(f"Saved updated manifest to {MANIFEST_PATH}")
 
138
 
139
+ print("\nGA Resolution embedding process complete!")
140
 
141
  except Exception as e:
142
+ print(f"An error occurred while saving the files: {e}")
143
+
 
144
 
145
  # Call the function to start the embedding process
146
  if __name__ == "__main__":
147
+ encode_ga_resolutions_with_caching()
small_scripts/make_embedding/embeddings_manifest.json ADDED
@@ -0,0 +1,804 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": 0,
3
+ "2": 1,
4
+ "3": 2,
5
+ "4": 3,
6
+ "5": 4,
7
+ "6": 5,
8
+ "7": 6,
9
+ "8": 7,
10
+ "9": 8,
11
+ "10": 9,
12
+ "11": 10,
13
+ "12": 11,
14
+ "13": 12,
15
+ "14": 13,
16
+ "15": 14,
17
+ "16": 15,
18
+ "17": 16,
19
+ "18": 17,
20
+ "19": 18,
21
+ "20": 19,
22
+ "21": 20,
23
+ "22": 21,
24
+ "23": 22,
25
+ "24": 23,
26
+ "25": 24,
27
+ "26": 25,
28
+ "27": 26,
29
+ "28": 27,
30
+ "29": 28,
31
+ "30": 29,
32
+ "31": 30,
33
+ "32": 31,
34
+ "33": 32,
35
+ "34": 33,
36
+ "35": 34,
37
+ "36": 35,
38
+ "37": 36,
39
+ "38": 37,
40
+ "39": 38,
41
+ "40": 39,
42
+ "41": 40,
43
+ "42": 41,
44
+ "43": 42,
45
+ "44": 43,
46
+ "45": 44,
47
+ "46": 45,
48
+ "47": 46,
49
+ "48": 47,
50
+ "49": 48,
51
+ "50": 49,
52
+ "53": 50,
53
+ "54": 51,
54
+ "55": 52,
55
+ "57": 53,
56
+ "59": 54,
57
+ "61": 55,
58
+ "62": 56,
59
+ "63": 57,
60
+ "65": 58,
61
+ "66": 59,
62
+ "67": 60,
63
+ "68": 61,
64
+ "69": 62,
65
+ "70": 63,
66
+ "71": 64,
67
+ "72": 65,
68
+ "74": 66,
69
+ "75": 67,
70
+ "77": 68,
71
+ "78": 69,
72
+ "79": 70,
73
+ "81": 71,
74
+ "84": 72,
75
+ "87": 73,
76
+ "90": 74,
77
+ "92": 75,
78
+ "93": 76,
79
+ "94": 77,
80
+ "95": 78,
81
+ "96": 79,
82
+ "97": 80,
83
+ "99": 81,
84
+ "100": 82,
85
+ "102": 83,
86
+ "104": 84,
87
+ "106": 85,
88
+ "108": 86,
89
+ "110": 87,
90
+ "111": 88,
91
+ "113": 89,
92
+ "114": 90,
93
+ "115": 91,
94
+ "116": 92,
95
+ "118": 93,
96
+ "121": 94,
97
+ "122": 95,
98
+ "123": 96,
99
+ "124": 97,
100
+ "125": 98,
101
+ "126": 99,
102
+ "127": 100,
103
+ "128": 101,
104
+ "129": 102,
105
+ "130": 103,
106
+ "132": 104,
107
+ "133": 105,
108
+ "134": 106,
109
+ "135": 107,
110
+ "137": 108,
111
+ "139": 109,
112
+ "141": 110,
113
+ "142": 111,
114
+ "143": 112,
115
+ "145": 113,
116
+ "146": 114,
117
+ "148": 115,
118
+ "149": 116,
119
+ "151": 117,
120
+ "154": 118,
121
+ "156": 119,
122
+ "157": 120,
123
+ "159": 121,
124
+ "161": 122,
125
+ "163": 123,
126
+ "165": 124,
127
+ "166": 125,
128
+ "168": 126,
129
+ "170": 127,
130
+ "172": 128,
131
+ "173": 129,
132
+ "175": 130,
133
+ "177": 131,
134
+ "178": 132,
135
+ "179": 133,
136
+ "180": 134,
137
+ "181": 135,
138
+ "183": 136,
139
+ "184": 137,
140
+ "185": 138,
141
+ "186": 139,
142
+ "188": 140,
143
+ "190": 141,
144
+ "193": 142,
145
+ "197": 143,
146
+ "199": 144,
147
+ "200": 145,
148
+ "202": 146,
149
+ "203": 147,
150
+ "204": 148,
151
+ "206": 149,
152
+ "207": 150,
153
+ "208": 151,
154
+ "210": 152,
155
+ "211": 153,
156
+ "213": 154,
157
+ "216": 155,
158
+ "219": 156,
159
+ "220": 157,
160
+ "221": 158,
161
+ "224": 159,
162
+ "225": 160,
163
+ "228": 161,
164
+ "229": 162,
165
+ "230": 163,
166
+ "233": 164,
167
+ "235": 165,
168
+ "237": 166,
169
+ "238": 167,
170
+ "239": 168,
171
+ "242": 169,
172
+ "244": 170,
173
+ "246": 171,
174
+ "248": 172,
175
+ "249": 173,
176
+ "250": 174,
177
+ "252": 175,
178
+ "254": 176,
179
+ "255": 177,
180
+ "256": 178,
181
+ "257": 179,
182
+ "259": 180,
183
+ "260": 181,
184
+ "261": 182,
185
+ "263": 183,
186
+ "266": 184,
187
+ "267": 185,
188
+ "269": 186,
189
+ "270": 187,
190
+ "272": 188,
191
+ "275": 189,
192
+ "277": 190,
193
+ "278": 191,
194
+ "279": 192,
195
+ "280": 193,
196
+ "281": 194,
197
+ "282": 195,
198
+ "283": 196,
199
+ "284": 197,
200
+ "287": 198,
201
+ "289": 199,
202
+ "291": 200,
203
+ "292": 201,
204
+ "293": 202,
205
+ "295": 203,
206
+ "296": 204,
207
+ "297": 205,
208
+ "299": 206,
209
+ "301": 207,
210
+ "304": 208,
211
+ "306": 209,
212
+ "308": 210,
213
+ "309": 211,
214
+ "310": 212,
215
+ "311": 213,
216
+ "312": 214,
217
+ "314": 215,
218
+ "315": 216,
219
+ "316": 217,
220
+ "317": 218,
221
+ "319": 219,
222
+ "320": 220,
223
+ "321": 221,
224
+ "324": 222,
225
+ "326": 223,
226
+ "328": 224,
227
+ "329": 225,
228
+ "330": 226,
229
+ "332": 227,
230
+ "333": 228,
231
+ "335": 229,
232
+ "336": 230,
233
+ "338": 231,
234
+ "339": 232,
235
+ "340": 233,
236
+ "341": 234,
237
+ "343": 235,
238
+ "344": 236,
239
+ "346": 237,
240
+ "347": 238,
241
+ "348": 239,
242
+ "349": 240,
243
+ "350": 241,
244
+ "351": 242,
245
+ "352": 243,
246
+ "354": 244,
247
+ "356": 245,
248
+ "357": 246,
249
+ "358": 247,
250
+ "360": 248,
251
+ "362": 249,
252
+ "364": 250,
253
+ "367": 251,
254
+ "370": 252,
255
+ "371": 253,
256
+ "372": 254,
257
+ "373": 255,
258
+ "375": 256,
259
+ "377": 257,
260
+ "378": 258,
261
+ "379": 259,
262
+ "380": 260,
263
+ "381": 261,
264
+ "382": 262,
265
+ "384": 263,
266
+ "385": 264,
267
+ "386": 265,
268
+ "389": 266,
269
+ "390": 267,
270
+ "391": 268,
271
+ "392": 269,
272
+ "395": 270,
273
+ "396": 271,
274
+ "397": 272,
275
+ "398": 273,
276
+ "400": 274,
277
+ "402": 275,
278
+ "403": 276,
279
+ "407": 277,
280
+ "408": 278,
281
+ "409": 279,
282
+ "411": 280,
283
+ "413": 281,
284
+ "415": 282,
285
+ "416": 283,
286
+ "418": 284,
287
+ "419": 285,
288
+ "420": 286,
289
+ "423": 287,
290
+ "431": 288,
291
+ "432": 289,
292
+ "435": 290,
293
+ "438": 291,
294
+ "439": 292,
295
+ "441": 293,
296
+ "442": 294,
297
+ "443": 295,
298
+ "445": 296,
299
+ "447": 297,
300
+ "449": 298,
301
+ "450": 299,
302
+ "455": 300,
303
+ "458": 301,
304
+ "460": 302,
305
+ "461": 303,
306
+ "464": 304,
307
+ "466": 305,
308
+ "473": 306,
309
+ "474": 307,
310
+ "476": 308,
311
+ "477": 309,
312
+ "479": 310,
313
+ "480": 311,
314
+ "481": 312,
315
+ "483": 313,
316
+ "484": 314,
317
+ "485": 315,
318
+ "487": 316,
319
+ "488": 317,
320
+ "490": 318,
321
+ "491": 319,
322
+ "492": 320,
323
+ "493": 321,
324
+ "495": 322,
325
+ "497": 323,
326
+ "503": 324,
327
+ "504": 325,
328
+ "505": 326,
329
+ "506": 327,
330
+ "507": 328,
331
+ "509": 329,
332
+ "510": 330,
333
+ "512": 331,
334
+ "514": 332,
335
+ "515": 333,
336
+ "517": 334,
337
+ "518": 335,
338
+ "519": 336,
339
+ "520": 337,
340
+ "521": 338,
341
+ "522": 339,
342
+ "523": 340,
343
+ "524": 341,
344
+ "525": 342,
345
+ "526": 343,
346
+ "527": 344,
347
+ "528": 345,
348
+ "529": 346,
349
+ "531": 347,
350
+ "534": 348,
351
+ "536": 349,
352
+ "537": 350,
353
+ "538": 351,
354
+ "541": 352,
355
+ "543": 353,
356
+ "544": 354,
357
+ "546": 355,
358
+ "548": 356,
359
+ "549": 357,
360
+ "550": 358,
361
+ "551": 359,
362
+ "553": 360,
363
+ "555": 361,
364
+ "557": 362,
365
+ "559": 363,
366
+ "560": 364,
367
+ "562": 365,
368
+ "563": 366,
369
+ "564": 367,
370
+ "565": 368,
371
+ "566": 369,
372
+ "567": 370,
373
+ "568": 371,
374
+ "570": 372,
375
+ "572": 373,
376
+ "573": 374,
377
+ "576": 375,
378
+ "577": 376,
379
+ "578": 377,
380
+ "579": 378,
381
+ "581": 379,
382
+ "583": 380,
383
+ "585": 381,
384
+ "587": 382,
385
+ "588": 383,
386
+ "589": 384,
387
+ "593": 385,
388
+ "596": 386,
389
+ "597": 387,
390
+ "599": 388,
391
+ "601": 389,
392
+ "602": 390,
393
+ "603": 391,
394
+ "609": 392,
395
+ "612": 393,
396
+ "613": 394,
397
+ "614": 395,
398
+ "615": 396,
399
+ "616": 397,
400
+ "618": 398,
401
+ "621": 399,
402
+ "623": 400,
403
+ "625": 401,
404
+ "627": 402,
405
+ "628": 403,
406
+ "629": 404,
407
+ "631": 405,
408
+ "637": 406,
409
+ "639": 407,
410
+ "641": 408,
411
+ "642": 409,
412
+ "644": 410,
413
+ "646": 411,
414
+ "648": 412,
415
+ "649": 413,
416
+ "651": 414,
417
+ "653": 415,
418
+ "654": 416,
419
+ "655": 417,
420
+ "658": 418,
421
+ "660": 419,
422
+ "661": 420,
423
+ "665": 421,
424
+ "666": 422,
425
+ "667": 423,
426
+ "668": 424,
427
+ "670": 425,
428
+ "676": 426,
429
+ "677": 427,
430
+ "683": 428,
431
+ "685": 429,
432
+ "686": 430,
433
+ "687": 431,
434
+ "690": 432,
435
+ "692": 433,
436
+ "694": 434,
437
+ "696": 435,
438
+ "697": 436,
439
+ "698": 437,
440
+ "699": 438,
441
+ "700": 439,
442
+ "701": 440,
443
+ "704": 441,
444
+ "705": 442,
445
+ "707": 443,
446
+ "708": 444,
447
+ "709": 445,
448
+ "711": 446,
449
+ "712": 447,
450
+ "714": 448,
451
+ "715": 449,
452
+ "717": 450,
453
+ "719": 451,
454
+ "727": 452,
455
+ "730": 453,
456
+ "731": 454,
457
+ "733": 455,
458
+ "734": 456,
459
+ "735": 457,
460
+ "736": 458,
461
+ "738": 459,
462
+ "739": 460,
463
+ "741": 461,
464
+ "742": 462,
465
+ "743": 463,
466
+ "744": 464,
467
+ "748": 465,
468
+ "751": 466,
469
+ "752": 467,
470
+ "753": 468,
471
+ "757": 469,
472
+ "758": 470,
473
+ "759": 471,
474
+ "760": 472,
475
+ "761": 473,
476
+ "762": 474,
477
+ "763": 475,
478
+ "766": 476,
479
+ "770": 477,
480
+ "772": 478,
481
+ "773": 479,
482
+ "775": 480,
483
+ "777": 481,
484
+ "780": 482,
485
+ "782": 483,
486
+ "783": 484,
487
+ "787": 485,
488
+ "790": 486,
489
+ "795": 487,
490
+ "797": 488,
491
+ "798": 489,
492
+ "801": 490,
493
+ "803": 491,
494
+ "805": 492,
495
+ "806": 493,
496
+ "807": 494,
497
+ "810": 495,
498
+ "811": 496,
499
+ "812": 497,
500
+ "813": 498,
501
+ "814": 499,
502
+ "815": 500,
503
+ "818": 501,
504
+ "820": 502,
505
+ "821": 503,
506
+ "823": 504,
507
+ "825": 505,
508
+ "827": 506,
509
+ "828": 507,
510
+ "830": 508,
511
+ "832": 509,
512
+ "834": 510,
513
+ "836": 511,
514
+ "839": 512,
515
+ "840": 513,
516
+ "841": 514,
517
+ "843": 515,
518
+ "844": 516,
519
+ "846": 517,
520
+ "847": 518,
521
+ "848": 519,
522
+ "850": 520,
523
+ "852": 521,
524
+ "854": 522,
525
+ "855": 523,
526
+ "856": 524,
527
+ "858": 525,
528
+ "860": 526,
529
+ "862": 527,
530
+ "863": 528,
531
+ "867": 529,
532
+ "869": 530,
533
+ "870": 531,
534
+ "873": 532,
535
+ "874": 533,
536
+ "875": 534,
537
+ "876": 535,
538
+ "879": 536,
539
+ "881": 537,
540
+ "883": 538,
541
+ "885": 539,
542
+ "886": 540,
543
+ "888": 541,
544
+ "890": 542,
545
+ "891": 543,
546
+ "893": 544,
547
+ "894": 545,
548
+ "895": 546,
549
+ "897": 547,
550
+ "899": 548,
551
+ "900": 549,
552
+ "902": 550,
553
+ "903": 551,
554
+ "905": 552,
555
+ "907": 553,
556
+ "909": 554,
557
+ "911": 555,
558
+ "913": 556,
559
+ "914": 557,
560
+ "915": 558,
561
+ "916": 559,
562
+ "917": 560,
563
+ "918": 561,
564
+ "920": 562,
565
+ "923": 563,
566
+ "924": 564,
567
+ "926": 565,
568
+ "929": 566,
569
+ "930": 567,
570
+ "932": 568,
571
+ "934": 569,
572
+ "936": 570,
573
+ "937": 571,
574
+ "939": 572,
575
+ "942": 573,
576
+ "943": 574,
577
+ "944": 575,
578
+ "946": 576,
579
+ "947": 577,
580
+ "949": 578,
581
+ "952": 579,
582
+ "954": 580,
583
+ "956": 581,
584
+ "958": 582,
585
+ "959": 583,
586
+ "960": 584,
587
+ "961": 585,
588
+ "964": 586,
589
+ "966": 587,
590
+ "967": 588,
591
+ "970": 589,
592
+ "972": 590,
593
+ "974": 591,
594
+ "976": 592,
595
+ "978": 593,
596
+ "979": 594,
597
+ "981": 595,
598
+ "982": 596,
599
+ "984": 597,
600
+ "985": 598,
601
+ "988": 599,
602
+ "990": 600,
603
+ "992": 601,
604
+ "993": 602,
605
+ "998": 603,
606
+ "999": 604,
607
+ "1000": 605,
608
+ "1001": 606,
609
+ "1004": 607,
610
+ "1006": 608,
611
+ "1008": 609,
612
+ "1009": 610,
613
+ "1012": 611,
614
+ "1014": 612,
615
+ "1016": 613,
616
+ "1021": 614,
617
+ "1024": 615,
618
+ "1025": 616,
619
+ "1026": 617,
620
+ "1027": 618,
621
+ "1030": 619,
622
+ "1033": 620,
623
+ "1036": 621,
624
+ "1039": 622,
625
+ "1040": 623,
626
+ "1042": 624,
627
+ "1044": 625,
628
+ "1045": 626,
629
+ "1048": 627,
630
+ "1049": 628,
631
+ "1052": 629,
632
+ "1054": 630,
633
+ "1056": 631,
634
+ "1058": 632,
635
+ "1059": 633,
636
+ "1060": 634,
637
+ "1061": 635,
638
+ "1062": 636,
639
+ "1063": 637,
640
+ "1065": 638,
641
+ "1067": 639,
642
+ "1068": 640,
643
+ "1070": 641,
644
+ "1072": 642,
645
+ "1074": 643,
646
+ "1077": 644,
647
+ "1078": 645,
648
+ "1079": 646,
649
+ "1081": 647,
650
+ "1083": 648,
651
+ "1086": 649,
652
+ "1088": 650,
653
+ "1091": 651,
654
+ "1092": 652,
655
+ "1093": 653,
656
+ "1095": 654,
657
+ "1096": 655,
658
+ "1097": 656,
659
+ "1100": 657,
660
+ "1102": 658,
661
+ "1104": 659,
662
+ "1106": 660,
663
+ "1108": 661,
664
+ "1109": 662,
665
+ "1111": 663,
666
+ "1113": 664,
667
+ "1115": 665,
668
+ "1118": 666,
669
+ "1120": 667,
670
+ "1122": 668,
671
+ "1124": 669,
672
+ "1126": 670,
673
+ "1130": 671,
674
+ "1132": 672,
675
+ "1134": 673,
676
+ "1136": 674,
677
+ "1138": 675,
678
+ "1139": 676,
679
+ "1141": 677,
680
+ "1143": 678,
681
+ "1144": 679,
682
+ "1146": 680,
683
+ "1148": 681,
684
+ "1150": 682,
685
+ "1151": 683,
686
+ "1152": 684,
687
+ "1154": 685,
688
+ "1156": 686,
689
+ "1158": 687,
690
+ "1160": 688,
691
+ "1161": 689,
692
+ "1163": 690,
693
+ "1166": 691,
694
+ "1168": 692,
695
+ "1171": 693,
696
+ "1173": 694,
697
+ "1176": 695,
698
+ "1178": 696,
699
+ "1180": 697,
700
+ "1182": 698,
701
+ "1184": 699,
702
+ "1186": 700,
703
+ "1188": 701,
704
+ "1190": 702,
705
+ "1191": 703,
706
+ "1193": 704,
707
+ "1195": 705,
708
+ "1197": 706,
709
+ "1198": 707,
710
+ "1199": 708,
711
+ "1201": 709,
712
+ "1203": 710,
713
+ "1205": 711,
714
+ "1207": 712,
715
+ "1208": 713,
716
+ "1209": 714,
717
+ "1211": 715,
718
+ "1212": 716,
719
+ "1214": 717,
720
+ "1216": 718,
721
+ "1217": 719,
722
+ "1218": 720,
723
+ "1220": 721,
724
+ "1223": 722,
725
+ "1224": 723,
726
+ "1225": 724,
727
+ "1228": 725,
728
+ "1229": 726,
729
+ "1230": 727,
730
+ "1231": 728,
731
+ "1232": 729,
732
+ "1233": 730,
733
+ "1234": 731,
734
+ "1235": 732,
735
+ "1236": 733,
736
+ "1237": 734,
737
+ "1240": 735,
738
+ "1242": 736,
739
+ "1244": 737,
740
+ "1245": 738,
741
+ "1246": 739,
742
+ "1247": 740,
743
+ "1248": 741,
744
+ "1250": 742,
745
+ "1252": 743,
746
+ "1254": 744,
747
+ "1257": 745,
748
+ "1258": 746,
749
+ "1259": 747,
750
+ "1260": 748,
751
+ "1263": 749,
752
+ "1266": 750,
753
+ "1270": 751,
754
+ "1272": 752,
755
+ "1273": 753,
756
+ "1275": 754,
757
+ "1277": 755,
758
+ "1278": 756,
759
+ "1280": 757,
760
+ "1281": 758,
761
+ "1283": 759,
762
+ "1285": 760,
763
+ "1287": 761,
764
+ "1291": 762,
765
+ "1293": 763,
766
+ "1296": 764,
767
+ "1297": 765,
768
+ "1298": 766,
769
+ "1299": 767,
770
+ "1300": 768,
771
+ "1303": 769,
772
+ "1304": 770,
773
+ "1305": 771,
774
+ "1307": 772,
775
+ "1310": 773,
776
+ "1312": 774,
777
+ "1315": 775,
778
+ "1316": 776,
779
+ "1317": 777,
780
+ "1318": 778,
781
+ "1324": 779,
782
+ "1325": 780,
783
+ "1327": 781,
784
+ "1329": 782,
785
+ "1330": 783,
786
+ "1332": 784,
787
+ "1333": 785,
788
+ "1336": 786,
789
+ "1338": 787,
790
+ "1339": 788,
791
+ "1340": 789,
792
+ "1341": 790,
793
+ "1343": 791,
794
+ "1345": 792,
795
+ "1347": 793,
796
+ "1350": 794,
797
+ "1351": 795,
798
+ "1352": 796,
799
+ "1354": 797,
800
+ "1355": 798,
801
+ "1357": 799,
802
+ "1359": 800,
803
+ "1361": 801
804
+ }
small_scripts/parse_ga_resolutions.py CHANGED
@@ -1,228 +1,212 @@
 
 
1
  import json
2
- import re
3
- from bs4 import BeautifulSoup
4
- from markdownify import markdownify as md
5
-
6
-
7
def parse_resolution_html(html_string):
    """
    Parses a single HTML string representing a World Assembly resolution
    into a structured Python dictionary.

    Extracts status, header metadata (title, id, council, description),
    info-box fields (category, area of effect, strength, proposer, repeal
    target), the body (converted to Markdown), co-authors, passed/repealed
    dates, and vote counts/percentages.

    Args:
        html_string: The HTML string of a single resolution block.

    Returns:
        A dictionary representing the resolution data, or None if parsing fails.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    data = {}

    # Find the main resolution container div
    thing_div = soup.find('div', class_='WA_thing')
    if not thing_div:
        # If the main container isn't found, it's not a valid resolution block
        print("Warning: Could not find the main 'WA_thing' div.")
        return None

    # --- Status (Repealed) ---
    # Check if the resolution has been repealed
    repealed_div = thing_div.find('div', class_='WA_thing_repealed')
    if repealed_div:
        data['status'] = 'Repealed'
        # Extract info about the repealing resolution if available
        repeal_line = repealed_div.find('p', class_='WA_thing_repealline')
        if repeal_line:
            repealer_link = repeal_line.find('a')
            if repealer_link:
                data['repealed_by'] = {
                    'id': repealer_link.text.strip().replace('Repealed by GA#', '').strip(),  # Extract just the number
                    'link': repealer_link.get('href')
                }
    else:
        data['status'] = 'Active'

    # --- Header ---
    header_div = thing_div.find('div', class_='WA_thing_header')
    if header_div:
        # Raw title (e.g., "General Assembly Resolution # 769")
        rtitle_tag = header_div.find('p', class_='WA_rtitle')
        if rtitle_tag:
            data['raw_title'] = rtitle_tag.text.strip()

        # Main title and link (contains ID and Council)
        title_link_tag = header_div.find('h2').find('a') if header_div.find('h2') else None
        if title_link_tag:
            data['title'] = title_link_tag.text.strip()
            href = title_link_tag.get('href')
            data['link'] = href
            # Extract ID and Council from the link
            match = re.search(r'id=(\d+)/council=(\d+)', href)
            if match:
                data['id'] = int(match.group(1))
                data['council'] = int(match.group(2))

        # Description (the paragraph without a specific class in the header)
        # Find all p tags in header and take the last one that isn't rtitle or repealline
        all_ps_in_header = header_div.find_all('p')
        description_tag = None
        for p_tag in reversed(all_ps_in_header):
            if 'WA_rtitle' not in p_tag.get('class', []) and 'WA_thing_repealline' not in p_tag.get('class', []):
                description_tag = p_tag
                break
        if description_tag:
            data['description'] = description_tag.text.strip()


    # --- Info Box (Category, Area, Strength, Proposed by, Repeals info) ---
    rbox_div = thing_div.find('div', class_='WA_thing_rbox')
    if rbox_div:
        # Iterate through paragraphs in the info box
        for p_tag in rbox_div.find_all('p'):
            leader_span = p_tag.find('span', class_='WA_leader')
            if leader_span:
                label = leader_span.text.strip().replace(':', '')
                # Get the text content after the leader span
                content_text = leader_span.next_sibling
                if content_text:
                    content_text = content_text.strip()

                if label == 'Category':
                    data['category'] = content_text
                elif label == 'Area of Effect':
                    data['area_of_effect'] = content_text
                elif label == 'Strength':
                    data['strength'] = content_text
                elif label == 'Resolution':  # This tag appears specifically for Repeal resolutions
                    link_tag = p_tag.find('a')
                    if link_tag:
                        data['repeals'] = {
                            'ga_id': link_tag.text.strip().replace('GA#', ''),
                            'link': link_tag.get('href')
                        }
                elif label == 'Proposed by':
                    nation_link_tag = p_tag.find('a', class_='nlink')
                    if nation_link_tag:
                        nation_name_span = nation_link_tag.find('span', class_='nnameblock')
                        if nation_name_span:
                            data['proposed_by'] = {
                                'name': nation_name_span.text.strip(),
                                'link': nation_link_tag.get('href')
                            }


    # --- Body HTML ---
    body_div = thing_div.find('div', class_='WA_thing_body')
    if body_div:
        # Get the inner HTML content while preserving tags
        # NOTE: md() is markdownify — converts the resolution's HTML body to Markdown
        data['body'] = md(body_div.decode_contents().strip())

    # --- Co-authors ---
    # Co-authors paragraph is a sibling immediately after the body div
    coauthors_p = body_div.find_next_sibling('p') if body_div else None
    if coauthors_p and coauthors_p.find('span', class_='WA_leader', string='Co-authors:'):
        coauthors_list = []
        # Find all nation links within this paragraph
        for a_tag in coauthors_p.find_all('a', class_='nlink'):
            nname_span = a_tag.find('span', class_='nnameblock')
            if nname_span:
                coauthors_list.append({
                    'name': nname_span.text.strip(),
                    'link': a_tag.get('href')
                })
        if coauthors_list:
            data['co_authors'] = coauthors_list

    # --- Vote Counts and Dates ---
    presbottom_div = thing_div.find('div', class_='WApresbottom')
    if presbottom_div:
        # Passed/Repealed Dates (in floatrightbox)
        floatrightbox = presbottom_div.find('div', class_='floatrightbox')
        if floatrightbox:
            # Passed date
            passed_leader_p = floatrightbox.find('p', class_='WA_leader', string='Passed:')
            if passed_leader_p:
                # Navigate up to the <td>, then find the next sibling <td>, then find the <p> inside it, then the <time>
                passed_leader_td = passed_leader_p.find_parent('td')
                if passed_leader_td:  # Add a check here too just in case the structure is unexpected
                    date_td = passed_leader_td.find_next_sibling('td')
                    if date_td:  # Check if the next <td> exists
                        date_p = date_td.find('p')  # Find the paragraph inside that <td>
                        if date_p:  # Check if the paragraph exists
                            passed_time_tag = date_p.find('time')  # Find the time tag inside that paragraph

                            if passed_time_tag:
                                data['passed_date'] = {
                                    'datetime': passed_time_tag.get('datetime'),
                                    'text': passed_time_tag.text.strip()
                                }

            # Repealed date (only present if repealed)
            # Apply similar robust navigation here
            repealed_leader_p = floatrightbox.find('p', class_='WA_leader')
            if repealed_leader_p and repealed_leader_p.find('a', string='Repealed:'):
                repealed_leader_td = repealed_leader_p.find_parent('td')
                if repealed_leader_td:
                    date_td = repealed_leader_td.find_next_sibling('td')
                    if date_td:
                        date_p = date_td.find('p')
                        if date_p:
                            repealed_time_tag = date_p.find('time')

                            if repealed_time_tag:
                                # Ensure status is marked repealed even if WA_thing_repealed div was missed
                                if 'status' not in data or data['status'] != 'Repealed':
                                    data['status'] = 'Repealed'
                                data['repealed_date'] = {
                                    'datetime': repealed_time_tag.get('datetime'),
                                    'text': repealed_time_tag.text.strip()
                                }
        # Vote Counts (in WA_votecount table)
        # This part of the logic seems mostly correct because you're navigating cell by cell within the row
        votecount_table = presbottom_div.find('table', class_='WA_votecount')
        if votecount_table:
            for row in votecount_table.find_all('tr'):
                leader_cell = row.find('p', class_='WA_leader')
                if leader_cell:
                    label = leader_cell.text.strip().replace(':', '')
                    if label in ['For', 'Against']:
                        # Find the cells for count and percentage relative to the leader cell
                        # These navigations (find_parent('td').find_next_sibling('td')) are correct
                        count_cell = leader_cell.find_parent('td').find_next_sibling('td')
                        percentage_cell = count_cell.find_next_sibling('td') if count_cell else None

                        count_text = count_cell.find('span', class_='bigtext').text.strip().replace(',', '') if count_cell and count_cell.find('span', class_='bigtext') else '0'
                        percentage_text = percentage_cell.find('span', class_='smalltext').text.strip().replace('%', '') if percentage_cell and percentage_cell.find('span', class_='smalltext') else '0'


                        try:
                            data[label.lower() + '_votes'] = int(count_text)
                        except ValueError:
                            data[label.lower() + '_votes'] = 0  # Handle potential parsing errors

                        try:
                            data[label.lower() + '_percentage'] = float(percentage_text)
                        except ValueError:
                            data[label.lower() + '_percentage'] = 0.0

    return data
209
-
210
-
211
def __main__():
    """Convert the scraped HTML resolutions into structured JSON.

    Reads the raw HTML blocks from ga_resolutions.json, parses each one with
    parse_resolution_html, and writes the resulting list of dicts to
    ../parsed_ga_resolutions.json.
    """
    # BUGFIX: the original left both file handles open (the output file was
    # never closed or flushed explicitly); use context managers instead.
    with open("ga_resolutions.json", "r") as resolutions:
        html_resolutions = json.load(resolutions)
    json_resolutions = [parse_resolution_html(resolution) for resolution in html_resolutions]
    with open("../parsed_ga_resolutions.json", "w") as output:
        json.dump(json_resolutions, output)


__main__()
222
-
223
def format_resoluton(resolution):
    """Format a parsed resolution dict as a tagged plain-text document.

    Args:
        resolution: A dict as produced by parse_resolution_html. Requires
            'status', 'id', 'title' and 'body'; uses 'repealed_by' (a dict
            with an 'id' key) when the resolution is repealed.

    Returns:
        The resolution wrapped in <resolution> tags, with a repeal notice
        prefixed to the title line when applicable.
    """
    title = "<resolution>\n"
    # BUGFIX: the parser stores status as 'Repealed' (not 'REPEALED');
    # 'repealed_by' is a dict, so interpolate its 'id' key. The old code
    # also clobbered the opening tag and never closed the '[' bracket.
    if resolution['status'] == 'Repealed':
        repealer = resolution.get('repealed_by') or {}
        title += f"[Repealed by GA#{repealer.get('id', '?')}] "
    title += f"GA#{resolution['id']} {resolution['title']}"
    return title + "\n\n" + resolution['body'] + "\n</resolution>"
 
1
+ import requests
2
+ import xml.etree.ElementTree as ET
3
  import json
4
+ import time
5
+ import os
6
+
7
+ # --- Configuration ---
8
+ # Replace with your own nation name or contact info.
9
+ USER_AGENT = "NS Issue Search dev update script (Jiangbei)"
10
+ CACHE_FILE = "../parsed_ga_resolutions.json"
11
+ API_BASE_URL = "https://www.nationstates.net/cgi-bin/api.cgi"
12
+ COUNCIL_ID = 1 # 1 for General Assembly, 2 for Security Council
13
+
14
+
15
def load_cache(filename):
    """Load previously fetched resolutions from the JSON cache file.

    Args:
        filename: Path to the cache file (a JSON list of resolution dicts).

    Returns:
        A dict mapping resolution ID -> resolution dict for fast lookups.
        An empty dict if the file is missing, unreadable, or malformed.
    """
    if not os.path.exists(filename):
        # BUGFIX: the messages printed the literal text '(unknown)' instead
        # of the actual path — interpolate the filename.
        print(f"Cache file '{filename}' not found. Will start from scratch.")
        return {}

    try:
        with open(filename, 'r', encoding='utf-8') as f:
            resolutions_list = json.load(f)
        # Convert list to a dictionary keyed by resolution ID for fast lookups.
        # KeyError/TypeError cover malformed entries (e.g. missing 'id').
        return {res['id']: res for res in resolutions_list}
    except (json.JSONDecodeError, IOError, KeyError, TypeError) as e:
        print(f"Error reading cache file '{filename}': {e}. Starting from scratch.")
        return {}
29
+
30
+
31
def save_cache(filename, resolutions_dict):
    """Write the resolutions dictionary to the JSON cache file.

    The on-disk format is a JSON list of resolution dicts sorted by ID.

    Args:
        filename: Destination path for the JSON cache.
        resolutions_dict: Mapping of resolution ID -> resolution dict.
    """
    try:
        # Convert the dictionary values back to a list and sort by ID
        sorted_resolutions = sorted(resolutions_dict.values(), key=lambda r: r['id'])
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(sorted_resolutions, f, indent=2)
        # BUGFIX: interpolate the actual filename instead of the literal
        # '(unknown)' placeholder in the status messages.
        print(f"Successfully saved {len(sorted_resolutions)} resolutions to '{filename}'.")
    except IOError as e:
        print(f"Error writing to cache file '{filename}': {e}")
41
+
42
+
43
def parse_resolution_xml(xml_string):
    """
    Parses a single XML string from the NationStates API into a structured dictionary.

    Args:
        xml_string: The XML content from the API response.

    Returns:
        A dictionary representing the resolution data, or None if parsing fails
        or the <RESOLUTION> element is empty (i.e. the resolution does not exist).
    """
    try:
        root = ET.fromstring(xml_string)
        res_node = root.find('RESOLUTION')

        # If the RESOLUTION tag is empty, it means the resolution doesn't exist.
        if res_node is None or not list(res_node):
            return None

        data = {}
        # Iterate through all direct child tags of <RESOLUTION>
        for child in res_node:
            # Special case for COAUTHOR, which has multiple <N> children
            if child.tag == 'COAUTHOR':
                co_authors = [n.text for n in child.findall('N')]
                if co_authors:
                    data['co_authors'] = co_authors
                continue  # Skip to the next tag

            key = child.tag.lower()
            value = child.text

            # Numeric fields (ids, timestamps, vote counts) become ints;
            # everything else stays as text (or None for empty tags).
            try:
                data[key] = int(value)
            except (ValueError, TypeError):
                data[key] = value

        # --- Map API fields to desired dictionary structure ---
        # Keep required fields with consistent naming
        if 'name' in data: data['title'] = data.pop('name')
        if 'resid' in data: data['id'] = data.pop('resid')
        if 'desc' in data: data['body'] = data.pop('desc')  # Keep BBCode as text
        if 'councilid' in data: data['council'] = data.pop('councilid')

        # A REPEALED_BY field means this resolution has itself been repealed.
        if 'repealed_by' in data:
            data['status'] = 'Repealed'
            data['repealed_by'] = {
                'id': data.pop('repealed_by'),
                'timestamp': data.pop('repealed', None)
            }
        else:
            data['status'] = 'Active'

        # REPEALS_RESID means this resolution IS a repeal of another one.
        # BUGFIX: pop the council id with a default so a missing
        # REPEALS_COUNCILID tag cannot raise KeyError.
        if 'repeals_resid' in data:
            data['repeals'] = {
                'id': data.pop('repeals_resid'),
                'council': data.pop('repeals_councilid', None)
            }

        return data

    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return None
109
 
110
+
111
def main():
    """Fetch new WA resolutions from the NationStates API and update the cache.

    Resumes from the highest resolution ID already cached, honours the API's
    rate-limit headers, stops at the first non-existent ID, then merges the
    new resolutions into the cache (marking repealed targets) and saves it.
    """
    print("--- World Assembly Resolution Fetcher ---")

    # Load existing resolutions from cache
    cached_resolutions = load_cache(CACHE_FILE)
    if cached_resolutions:
        # Find the latest resolution ID we already have and start from the next one
        start_id = max(cached_resolutions.keys()) + 1
        print(f"Loaded {len(cached_resolutions)} resolutions from cache. Starting fetch from GA#{start_id}.")
    else:
        start_id = 1

    # --- API Request Loop ---
    session = requests.Session()
    session.headers.update({'User-Agent': USER_AGENT})

    current_id = start_id
    newly_fetched = []

    # NationStates allows 50 requests per 30 seconds; tracked via response headers.
    rate_limit_info = {
        'remaining': 50,
        'reset_in': 30
    }

    while True:
        # Check if we are about to exceed the rate limit
        if rate_limit_info['remaining'] < 2:
            wait_time = rate_limit_info['reset_in'] + 1  # Add a small buffer
            print(f"Rate limit approaching. Waiting for {wait_time} seconds...")
            time.sleep(wait_time)

        print(f"Fetching resolution GA#{current_id}...")

        params = {'wa': COUNCIL_ID, 'id': current_id, 'q': 'resolution'}
        try:
            response = session.get(API_BASE_URL, params=params, timeout=15)

            # Update rate limit info from headers after every request
            rate_limit_info['remaining'] = int(response.headers.get('RateLimit-Remaining', 50))
            rate_limit_info['reset_in'] = int(response.headers.get('RateLimit-Reset', 30))

            # Handle API responses
            if response.status_code == 429:
                retry_after = int(response.headers.get('Retry-After', 30))
                print(f"Rate limit exceeded (429). Waiting for {retry_after} seconds as requested by API.")
                time.sleep(retry_after)
                continue  # Retry the same ID

            response.raise_for_status()  # Raises an error for other bad responses (4xx or 5xx)

        except requests.exceptions.RequestException as e:
            print(f"An error occurred during request for GA#{current_id}: {e}")
            print("Stopping script. Run again to resume.")
            break

        # Parse the response content
        parsed_data = parse_resolution_xml(response.text)

        if parsed_data:
            newly_fetched.append(parsed_data)
            current_id += 1
            time.sleep(0.7)  # Be polite: 50 requests/30s = 0.6s per request. Add a small delay.
        else:
            # API returns empty <RESOLUTION> for non-existent IDs, signaling we are done.
            print(f"GA#{current_id} does not exist. Assuming it's the last one.")
            print("--- Fetching complete. ---")
            break

    # --- Post-Fetch Processing ---
    if not newly_fetched:
        print("No new resolutions found. Cache is up-to-date.")
        return

    print(f"Fetched {len(newly_fetched)} new resolutions.")

    # Update cache with new data
    updates_made = 0
    for res in newly_fetched:
        # BUGFIX: the original checked res['repealed_by'] and looked up
        # res['id'] in the cache — but a newly fetched ID is never already
        # cached, so cached targets were never marked repealed. A repeal
        # resolution carries the target's ID in its 'repeals' field; use
        # that to update the repealed resolution.
        repeal_target = res.get('repeals')
        if repeal_target:
            target_id = repeal_target['id']
            if target_id in cached_resolutions and cached_resolutions[target_id]['status'] == 'Active':
                print(
                    f"Updating status for GA#{target_id}: was Active, now Repealed by GA#{res['id']}.")
                cached_resolutions[target_id]['status'] = 'Repealed'
                cached_resolutions[target_id]['repealed_by'] = {
                    'id': res['id'],
                    # presumably the repeal's IMPLEMENTED timestamp is the
                    # repeal time — TODO confirm against the API docs.
                    'timestamp': res.get('implemented')
                }
                updates_made += 1

        # Add the new resolution to our collection
        cached_resolutions[res['id']] = res

    if updates_made:
        print(f"Updated the status of {updates_made} existing resolutions.")

    # Save the final, complete collection to the cache file
    save_cache(CACHE_FILE, cached_resolutions)


if __name__ == "__main__":
    main()