Spaces:
Sleeping
Sleeping
File size: 3,784 Bytes
b5cf002 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
def flatten_list(lst):
    """
    Flattens an arbitrarily nested list into a single flat list.

    Only ``list`` instances are descended into; every other value (tuples,
    strings, dicts, ...) is kept as one element. Elements keep their
    left-to-right, depth-first order. A new list is always returned, even
    when the input contains no nested lists.

    The traversal keeps an explicit stack of iterators instead of recursing,
    so nesting deeper than the interpreter's recursion limit does not raise
    ``RecursionError``.

    Args:
        lst (list): The (possibly nested) list to flatten.

    Returns:
        list: A new list holding every non-list element of ``lst``.

    Raises:
        ValueError: If ``lst`` is not a list.
    """
    if not isinstance(lst, list):
        raise ValueError("You must provide a valid list")

    flat = []
    # Each stack entry is a partially consumed iterator; the top of the
    # stack is the innermost list currently being walked.
    pending = [iter(lst)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend: the parent iterator remembers its position and
                # is resumed once the nested list is exhausted.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # The innermost iterator is exhausted; go back to its parent.
            pending.pop()
    return flat
def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """Flatten a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with ``sep``.
    Only ``dict`` values are descended into; every other value is copied as
    is. Entries keep the depth-first order of the input. If two paths
    produce the same flattened key, the later one wins. An empty nested
    dictionary contributes no entry to the result.

    A key that has no prefix (a top-level key when ``parent_key`` is empty)
    is kept unchanged, so it is not converted to ``str``; prefixed keys are
    always strings.

    The traversal keeps an explicit stack instead of recursing, so nesting
    deeper than the interpreter's recursion limit does not raise
    ``RecursionError``.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str): The base key string to use for the flattened keys.
        sep (str): The separator to use between parent and child keys.

    Returns:
        dict: The flattened dictionary.

    Raises:
        ValueError: If ``d`` is not a dictionary.
    """
    if not isinstance(d, dict):
        raise ValueError("You must provide a valid dictionary.")

    flat = {}
    # Each stack entry pairs the key prefix of a dictionary with a partially
    # consumed iterator over its items.
    pending = [(parent_key, iter(d.items()))]
    while pending:
        prefix, entries = pending[-1]
        for key, value in entries:
            full_key = f"{prefix}{sep}{key}" if prefix else key
            if isinstance(value, dict):
                # Descend; the parent iterator is resumed afterwards, which
                # preserves the depth-first key order.
                pending.append((full_key, iter(value.items())))
                break
            flat[full_key] = value
        else:
            # The current dictionary is exhausted; go back to its parent.
            pending.pop()
    return flat
def filter_dict_by_keys(original_dict, relevant_keys):
    """
    Builds a new dictionary restricted to the requested keys.

    Keys are visited in the iteration order of ``relevant_keys``; a key that
    is not present in ``original_dict`` is skipped silently. The input
    dictionary is not modified.

    Args:
        original_dict (dict): The dictionary to take values from.
        relevant_keys (set): The keys to keep.

    Returns:
        dict: The key-value pairs of ``original_dict`` whose key is in
        ``relevant_keys``.
    """
    selected = {}
    for wanted in relevant_keys:
        if wanted in original_dict:
            selected[wanted] = original_dict[wanted]
    return selected
from typing import List
import pandas as pd
def custom_struct_to_df(samples: List[List[pd.DataFrame]]) -> pd.DataFrame:
    """
    Converts a custom data structure (a list of pairs of DataFrames) into a single consolidated DataFrame.

    Each pair is joined side by side (``axis=1``), the joined pairs are
    stacked vertically, and the index is then renumbered from 0. An empty
    ``samples`` yields an empty DataFrame instead of the ``ValueError`` that
    ``pd.concat`` raises for an empty sequence.

    Args:
        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames. Each pair consists of:
            - A preprint DataFrame (e.g., containing information about preprints).
            - An article DataFrame (e.g., containing information about corresponding articles).

    Returns:
        pd.DataFrame: A single DataFrame where:
            - Each row corresponds to a preprint-article pair.
            - Preprint columns retain their prefix (e.g., 'prpnt').
            - Article columns retain their prefix (e.g., 'article').
            - Index is reset for the entire DataFrame.
    """
    # NOTE(review): the side-by-side concat aligns the two frames of a pair
    # on their index labels, so a pair only collapses into one row per
    # record when both frames carry the same labels — confirm that callers
    # guarantee this.
    paired_rows = [
        pd.concat([preprint, article], axis=1)
        for preprint, article in samples
    ]
    if not paired_rows:
        # pd.concat refuses an empty sequence; return an empty frame so the
        # function is total over its input.
        return pd.DataFrame()
    return pd.concat(paired_rows).reset_index(drop=True)
def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
    """
    Converts a DataFrame with prefixed columns (prpnt for preprint, article for article)
    into a list of pairs of DataFrames.

    Columns whose label does not start with either prefix are left out of
    both frames. Column labels that are not strings (for example integer
    labels) cannot carry a prefix and are left out the same way instead of
    raising ``AttributeError``.

    Args:
        df (pd.DataFrame): The input DataFrame with columns prefixed by `prpnt` and `article`.

    Returns:
        List[List[pd.DataFrame]]: A list of pairs of DataFrames [preprint, article],
        one pair per row of ``df``. Each element is a single-row DataFrame
        that keeps the original index label of its row. An empty ``df``
        yields an empty list.
    """
    # Split columns into preprint and article based on prefixes; the
    # isinstance guard keeps non-string labels from breaking startswith.
    preprint_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith("prpnt")
    ]
    article_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith("article")
    ]

    # Separate the DataFrame into two DataFrames for preprint and article
    preprint_df = df[preprint_columns].copy()
    article_df = df[article_columns].copy()

    # Combine rows into pairs of DataFrames. The list indexer ``[[pos]]``
    # makes iloc return a one-row DataFrame rather than a Series.
    return [
        [preprint_df.iloc[[pos]], article_df.iloc[[pos]]]
        for pos in range(len(df))
    ]
|