Spaces:
Sleeping
Sleeping
def flatten_list(lst):
    """
    Flatten an arbitrarily nested list into a single new list.

    Only ``list`` instances are descended into; every other element
    (including tuples, strings and dicts) is kept as-is. Elements appear in
    the result in depth-first, left-to-right order. A new list is always
    returned, even when the input contains no nested lists.

    Args:
        lst (list): The (possibly nested) list to flatten.
    Returns:
        list: A new flat list with all non-list elements.
    Raises:
        ValueError: If ``lst`` is not a list, or if a list contains itself
            (directly or indirectly), which would never terminate.
    """
    if not isinstance(lst, list):
        raise ValueError("You must provide a valid list")

    flat = []
    # An explicit stack of (list, iterator) pairs replaces recursion so that
    # the nesting depth is not bounded by the interpreter's recursion limit.
    stack = [(lst, iter(lst))]
    # ids of the lists currently being walked, used to detect cycles.
    on_path = {id(lst)}
    while stack:
        current, pending = stack[-1]
        for item in pending:
            if isinstance(item, list):
                if id(item) in on_path:
                    raise ValueError("Cannot flatten a list that contains itself")
                on_path.add(id(item))
                stack.append((item, iter(item)))
                # Descend now; the parent's iterator resumes where it stopped.
                break
            flat.append(item)
        else:
            # Iterator exhausted: this level is done.
            stack.pop()
            on_path.discard(id(current))
    return flat
def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """Flatten a nested dictionary into a single-level dictionary.

    Nested keys are joined to their parent's key with ``sep``. Only ``dict``
    values are descended into. Key order follows a depth-first walk of ``d``.

    Notes:
        - When the prefix is falsy (e.g. the default empty ``parent_key``),
          the key is used unchanged, so top-level keys keep their original
          type; prefixed keys are always strings.
        - An empty nested dictionary contributes no entries.
        - If two paths produce the same flattened key, the later value wins.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str): The base key string to use for the flattened keys.
        sep (str): The separator to use between parent and child keys.
    Returns:
        dict: The flattened dictionary.
    Raises:
        ValueError: If ``d`` is not a dictionary, or if a dictionary contains
            itself (directly or indirectly).
    """
    if not isinstance(d, dict):
        raise ValueError("You must provide a valid dictionary.")

    flat = {}
    # Explicit stack of (dict, key prefix, items iterator) instead of
    # recursion, so deep nesting cannot exceed the recursion limit.
    stack = [(d, parent_key, iter(d.items()))]
    # ids of the dicts currently being walked, used to detect cycles.
    on_path = {id(d)}
    while stack:
        current, prefix, pending = stack[-1]
        for k, v in pending:
            new_key = f"{prefix}{sep}{k}" if prefix else k
            if isinstance(v, dict):
                if id(v) in on_path:
                    raise ValueError("Cannot flatten a dictionary that contains itself")
                on_path.add(id(v))
                stack.append((v, new_key, iter(v.items())))
                # Descend now; the parent's iterator resumes where it stopped.
                break
            flat[new_key] = v
        else:
            # Iterator exhausted: this level is done.
            stack.pop()
            on_path.discard(id(current))
    return flat
def filter_dict_by_keys(original_dict, relevant_keys):
    """
    Build a new dictionary holding only the entries of ``original_dict``
    whose key appears in ``relevant_keys``.

    Keys listed in ``relevant_keys`` but absent from ``original_dict`` are
    skipped. Entries are inserted in the iteration order of ``relevant_keys``.

    Args:
        original_dict (dict): The dictionary to filter.
        relevant_keys (set): The keys to keep.
    Returns:
        dict: A new dictionary with only the relevant key-value pairs.
    """
    kept = {}
    for wanted in relevant_keys:
        if wanted not in original_dict:
            continue
        kept[wanted] = original_dict[wanted]
    return kept
from typing import List | |
import pandas as pd | |
def custom_struct_to_df(samples: List[List[pd.DataFrame]]) -> pd.DataFrame:
    """
    Converts a custom data structure (a list of pairs of DataFrames) into a single consolidated DataFrame.

    The two frames of each pair are joined side by side by row position, and
    the joined pairs are then stacked vertically. Column names are not
    modified.

    Args:
        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames. Each pair consists of:
            - A preprint DataFrame (e.g., containing information about preprints).
            - An article DataFrame (e.g., containing information about corresponding articles).
    Returns:
        pd.DataFrame: A single DataFrame where:
            - Each row corresponds to a preprint-article pair.
            - Preprint and article columns keep their original names.
            - Index is reset for the entire DataFrame.
            An empty DataFrame is returned when ``samples`` is empty.
    """
    paired_rows = [
        # Drop each frame's own index before the column-wise concat:
        # concat(axis=1) aligns on index labels, so a pair whose frames carry
        # different labels would otherwise spread over several NaN-padded rows.
        pd.concat(
            [preprint.reset_index(drop=True), article.reset_index(drop=True)],
            axis=1,
        )
        for preprint, article in samples
    ]
    if not paired_rows:
        # pd.concat raises on an empty sequence; no pairs means no rows.
        return pd.DataFrame()
    return pd.concat(paired_rows).reset_index(drop=True)
def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
    """
    Converts a DataFrame with prefixed columns (prpnt for preprint, article for article)
    into a list of pairs of DataFrames.

    Columns whose label is not a string, or that start with neither prefix,
    are ignored. Each returned frame is a single-row DataFrame that keeps the
    index label of the row it came from.

    Args:
        df (pd.DataFrame): The input DataFrame with columns prefixed by `prpnt` and `article`.
    Returns:
        List[List[pd.DataFrame]]: A list of pairs of DataFrames [preprint, article],
        one pair per row of ``df`` (an empty list for an empty ``df``).
    """
    # Positional boolean masks rather than label lists: non-string labels
    # cannot crash on .startswith, and duplicated labels are selected once
    # each instead of being multiplied by label-based lookup.
    is_preprint = [isinstance(col, str) and col.startswith("prpnt") for col in df.columns]
    is_article = [isinstance(col, str) and col.startswith("article") for col in df.columns]

    # Separate the DataFrame into two DataFrames for preprint and article
    preprint_df = df.loc[:, is_preprint].copy()
    article_df = df.loc[:, is_article].copy()

    # iloc[[i]] (list indexer) keeps each row as a one-row DataFrame
    # rather than collapsing it to a Series.
    return [
        [preprint_df.iloc[[i]], article_df.iloc[[i]]]
        for i in range(len(df))
    ]