stark-leaderboard

Running

App Files Files Community

Shiyu Zhao committed on Nov 14, 2024

Commit

743ad0c

1 Parent(s): d6d7173

Update space

Browse files

Files changed (1) hide show

app.py +70 -107

app.py CHANGED Viewed

@@ -15,10 +15,10 @@ from huggingface_hub import HfApi
 import shutil
 import tempfile
 import time
-from concurrent.futures import ThreadPoolExecutor
 from queue import Queue
 import threading
 from stark_qa import load_qa
 from stark_qa.evaluator import Evaluator
@@ -32,150 +32,113 @@ try:
 except Exception as e:
     raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
 def process_single_instance(args):
-    """Process a single instance with progress tracking"""
     idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
     try:
-        query, query_id, answer_ids, meta_info = qa_dataset[idx]
-        # Print progress for debugging
-        print(f"Processing query_id: {query_id}")
         try:
-            pred_rank = eval_csv[eval_csv['query_id'] == query_id]['pred_rank'].item()
-        except Exception as e:
-            print(f"Error getting pred_rank for query_id {query_id}: {str(e)}")
-            raise
-        if isinstance(pred_rank, str):
             pred_rank = eval(pred_rank)
-        pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
-        answer_ids = torch.LongTensor(answer_ids)
-        result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
-        result["idx"], result["query_id"] = idx, query_id
-        return result
-    except Exception as e:
-        print(f"Error in process_single_instance for idx {idx}: {str(e)}")
-        raise
-def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
     candidate_ids_dict = {
         'amazon': [i for i in range(957192)],
         'mag': [i for i in range(1172724, 1872968)],
         'prime': [i for i in range(129375)]
     }
-    start_time = time.time()
     try:
         eval_csv = pd.read_csv(csv_path)
         if 'query_id' not in eval_csv.columns:
             raise ValueError('No `query_id` column found in the submitted csv.')
         if 'pred_rank' not in eval_csv.columns:
             raise ValueError('No `pred_rank` column found in the submitted csv.')
         eval_csv = eval_csv[['query_id', 'pred_rank']]
         if dataset not in candidate_ids_dict:
             raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
         if split not in ['test', 'test-0.1', 'human_generated_eval']:
             raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
-        # print("Initializing evaluator...")
         evaluator = Evaluator(candidate_ids_dict[dataset])
         eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
-        # print("Loading QA dataset...")
         qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
         split_idx = qa_dataset.get_idx_split()
         all_indices = split_idx[split].tolist()
-        print(f"Dataset loaded, processing {len(all_indices)} instances")
-        # results_list = []
-        # query_ids = []
-        # # Prepare args for each worker
-        # args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
-        # with ProcessPoolExecutor(max_workers=num_workers) as executor:
-        #     futures = [executor.submit(process_single_instance, arg) for arg in args]
-        #     for future in tqdm(as_completed(futures), total=len(futures)):
-        #         result = future.result()  # This will raise an error if the worker encountered one
-        #         results_list.append(result)
-        #         query_ids.append(result['query_id'])
-        # # Concatenate results and compute final metrics
-        # eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
-        # final_results = {
-        #     metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
-        # }
-        # return final_result
-        batch_size = 100
-        results_list = []
-        progress_queue = Queue()
-        def process_batch(batch_indices):
-            batch_results = []
-            with ThreadPoolExecutor(max_workers=num_workers) as executor:
-                futures = [
-                    executor.submit(process_single_instance,
-                                 (idx, eval_csv, qa_dataset, evaluator, eval_metrics))
-                    for idx in batch_indices
-                ]
-                for future in futures:
                     result = future.result()
-                    if result is not None:
-                        batch_results.append(result)
-                    progress_queue.put(1)
-            return batch_results
-        # Process batches
-        total_batches = (len(all_indices) + batch_size - 1) // batch_size
-        remaining_indices = len(all_indices)
-        def update_progress():
-            with tqdm(total=len(all_indices), desc="Processing instances") as pbar:
-                completed = 0
-                while completed < len(all_indices):
-                    progress_queue.get()
-                    completed += 1
-                    pbar.update(1)
-        # Start progress monitoring thread
-        progress_thread = threading.Thread(target=update_progress)
-        progress_thread.start()
-        # Process batches
-        for i in range(0, len(all_indices), batch_size):
-            batch_indices = all_indices[i:min(i + batch_size, len(all_indices))]
-            batch_results = process_batch(batch_indices)
-            results_list.extend(batch_results)
-            remaining_indices -= len(batch_indices)
-            print(f"\rBatch {i//batch_size + 1}/{total_batches} completed. Remaining: {remaining_indices}")
-        progress_thread.join()
-        # Compute final metrics
-        if not results_list:
-            raise ValueError("No valid results were produced")
-        results_df = pd.DataFrame(results_list)
-        final_results = {
-            metric: results_df[metric].mean()
-            for metric in eval_metrics
-        }
-        elapsed_time = time.time() - start_time
-        print(f"\nMetrics computation completed in {elapsed_time:.2f} seconds")
         return final_results
     except Exception as error:
-        elapsed_time = time.time() - start_time
-        error_msg = f"Error in compute_metrics ({elapsed_time:.2f}s): {str(error)}"
-        print(error_msg)
-        return error_msg

 import shutil
 import tempfile
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from queue import Queue
 import threading
+from threading import Lock
 from stark_qa import load_qa
 from stark_qa.evaluator import Evaluator
 except Exception as e:
     raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
# Global lock serializing worker-thread lookups into the shared predictions
# DataFrame.
result_lock = Lock()


def process_single_instance(args):
    """Evaluate the submitted ranking for a single query.

    Args:
        args: Tuple ``(idx, eval_csv, qa_dataset, evaluator, eval_metrics)``.
            ``qa_dataset[idx]`` must yield
            ``(query, query_id, answer_ids, meta_info)``; ``eval_csv`` is a
            DataFrame with ``query_id`` and ``pred_rank`` columns;
            ``evaluator.evaluate(pred_dict, answer_ids, metrics=...)`` must
            return a mutable mapping.

    Returns:
        The mapping returned by ``evaluator.evaluate`` with ``"idx"`` and
        ``"query_id"`` added.

    Raises:
        IndexError: No row in ``eval_csv`` matches the query id.
        RuntimeError: Any other failure while looking up the prediction.
        ValueError: ``pred_rank`` is a string that is not a valid Python
            literal.
        TypeError: ``pred_rank`` is not a list after parsing.
    """
    # Function-scope import so this block does not depend on the file header.
    import ast

    idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
    query, query_id, answer_ids, meta_info = qa_dataset[idx]

    missing_msg = (
        f'Error when processing query_id={query_id}, please make sure the '
        f'predicted results exist for this query.'
    )
    try:
        # The lock only serializes the lookup; `.loc` with a boolean mask is
        # ordinary boolean indexing and is not inherently thread-safe.
        with result_lock:
            matching_rows = eval_csv.loc[eval_csv['query_id'] == query_id]
            if matching_rows.empty:
                raise IndexError(missing_msg)
            # If a query id appears several times, the first row wins.
            pred_rank = matching_rows['pred_rank'].iloc[0]
    except IndexError:
        raise IndexError(missing_msg) from None
    except Exception as e:
        raise RuntimeError(
            f'Unexpected error occurred while fetching prediction rank for '
            f'query_id={query_id}: {e}'
        ) from e

    if isinstance(pred_rank, str):
        # SECURITY: pred_rank comes from a user-uploaded CSV. Parse it as a
        # literal only; never eval() it, which would execute arbitrary code.
        try:
            pred_rank = ast.literal_eval(pred_rank)
        except (ValueError, SyntaxError) as e:
            raise ValueError(
                f'Failed to parse pred_rank as a list for query_id={query_id}: {e}'
            ) from e

    if not isinstance(pred_rank, list):
        raise TypeError(
            f'Error when processing query_id={query_id}, expected pred_rank '
            f'to be a list but got {type(pred_rank)}.'
        )

    # Score the top-100 candidates by negated rank so earlier entries score
    # higher (rank 0 -> 0, rank 1 -> -1, ...).
    pred_dict = {
        candidate: -rank for rank, candidate in enumerate(pred_rank[:100])
    }
    answer_ids = torch.LongTensor(answer_ids)

    result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
    result["idx"], result["query_id"] = idx, query_id
    return result
def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int = 4):
    """Score a submitted prediction CSV against one split of a dataset.

    Args:
        csv_path: Path to a CSV with ``query_id`` and ``pred_rank`` columns.
        dataset: One of ``'amazon'``, ``'mag'`` or ``'prime'``.
        split: One of ``'test'``, ``'test-0.1'`` or ``'human_generated_eval'``.
        num_threads: Number of worker threads used to evaluate instances.

    Returns:
        A dict mapping each of ``hit@1``, ``hit@5``, ``recall@20`` and ``mrr``
        to its mean over the successfully evaluated instances, or a string
        describing the error if anything failed (errors are returned, not
        raised).
    """
    # Candidate id ranges per dataset. Kept as lazy ranges so that only the
    # selected dataset's ids are materialized into a list.
    candidate_ids_dict = {
        'amazon': range(957192),
        'mag': range(1172724, 1872968),
        'prime': range(129375),
    }
    valid_splits = ['test', 'test-0.1', 'human_generated_eval']

    try:
        # Read and validate the submission.
        eval_csv = pd.read_csv(csv_path)
        if 'query_id' not in eval_csv.columns:
            raise ValueError('No `query_id` column found in the submitted csv.')
        if 'pred_rank' not in eval_csv.columns:
            raise ValueError('No `pred_rank` column found in the submitted csv.')
        eval_csv = eval_csv[['query_id', 'pred_rank']]

        # Validate the requested dataset and split.
        if dataset not in candidate_ids_dict:
            raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
        if split not in valid_splits:
            raise ValueError(f"Invalid split '{split}', expected one of {valid_splits}.")

        evaluator = Evaluator(list(candidate_ids_dict[dataset]))
        eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']

        qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
        split_idx = qa_dataset.get_idx_split()
        all_indices = split_idx[split].tolist()

        args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]

        # Results are collected only in this (the calling) thread while
        # draining `as_completed`, so no lock is needed around the list.
        results_list = []
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = [executor.submit(process_single_instance, arg) for arg in args]
            for future in tqdm(as_completed(futures), total=len(futures)):
                try:
                    results_list.append(future.result())
                except Exception as e:
                    # NOTE(review): failed instances are skipped, so the
                    # reported means cover only successfully evaluated
                    # queries. Confirm this best-effort policy is intended.
                    print(f"Error processing instance: {str(e)}")

        # Without this guard an empty result set surfaces as an opaque
        # KeyError on the first metric column.
        if not results_list:
            raise ValueError("No valid results were produced")

        results_df = pd.DataFrame(results_list)
        final_results = {
            metric: np.mean(results_df[metric])
            for metric in eval_metrics
        }
        return final_results
    except pd.errors.EmptyDataError:
        return "Error: The CSV file is empty or could not be read. Please check the file and try again."
    except FileNotFoundError:
        return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
    except Exception as error:
        return f"{error}"