File size: 3,784 Bytes
b5cf002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
def flatten_list(lst):
    """
    Flatten an arbitrarily nested list into a new single-level list.

    Only ``list`` instances are expanded; tuples, strings, dicts and every
    other object are kept as leaf elements. Element order is preserved
    (depth-first, left to right). The input is never mutated and a new list
    is always returned, even when the input contains no nested lists.

    The traversal uses an explicit stack rather than recursion, so nesting
    depth is not limited by the interpreter's recursion limit.

    Args:
        lst (list): The (possibly nested) list to flatten.

    Returns:
        list: A new flat list with all leaf elements.

    Raises:
        ValueError: If ``lst`` is not a list, or if a list contains itself
            (directly or indirectly), which could never be fully flattened.
    """
    if not isinstance(lst, list):
        raise ValueError("You must provide a valid list")

    flat = []
    # ids of the lists currently being walked; seeing one again means a cycle.
    active = {id(lst)}
    # Each stack entry pairs a list with its partially consumed iterator so
    # that we can resume the parent exactly where we left off.
    stack = [(lst, iter(lst))]

    while stack:
        current, items = stack[-1]
        for item in items:
            if isinstance(item, list):
                if id(item) in active:
                    raise ValueError(
                        "Cannot flatten a list that contains itself"
                    )
                active.add(id(item))
                stack.append((item, iter(item)))
                break  # descend into the nested list first
            flat.append(item)
        else:
            # Iterator exhausted: this list is done, go back to its parent.
            stack.pop()
            active.discard(id(current))

    return flat

def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their ancestors' keys with
    ``sep``. Values that are not ``dict`` instances are treated as leaves.
    A nested dictionary that is empty contributes no entries to the result.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str): Prefix prepended (followed by ``sep``) to every
            top-level key; when empty, top-level keys are used unchanged.
        sep (str): Separator placed between a parent key and a child key.

    Returns:
        dict: A new dictionary mapping the joined keys to the leaf values.

    Raises:
        ValueError: If ``d`` is not a dictionary.
    """
    if not isinstance(d, dict):
        raise ValueError("You must provide a valid dictionary.")

    flat = {}

    def _collect(mapping, prefix):
        # Walk one level, writing leaves into ``flat`` and descending into
        # nested dictionaries with the extended key as the new prefix.
        for key, value in mapping.items():
            if prefix:
                full_key = f"{prefix}{sep}{key}"
            else:
                full_key = key

            if isinstance(value, dict):
                _collect(value, full_key)
            else:
                flat[full_key] = value

    _collect(d, parent_key)
    return flat


def filter_dict_by_keys(original_dict, relevant_keys):
    """
    Build a new dictionary restricted to the requested keys.

    Keys listed in ``relevant_keys`` that are missing from ``original_dict``
    are skipped silently. Entries appear in the order in which
    ``relevant_keys`` is iterated.

    Args:
        original_dict (dict): The dictionary to filter.
        relevant_keys (set): The keys to keep (any iterable of keys works).

    Returns:
        dict: A new dictionary holding only the requested key-value pairs
        that exist in ``original_dict``.
    """
    kept = {}
    for wanted in relevant_keys:
        if wanted in original_dict:
            kept[wanted] = original_dict[wanted]
    return kept


from typing import List
import pandas as pd

def custom_struct_to_df(samples: List[List[pd.DataFrame]]) -> pd.DataFrame:
    """
    Converts a custom data structure (a list of pairs of DataFrames) into a single consolidated DataFrame.

    The two frames of each pair are joined side by side *by position*: their
    own index labels are discarded before joining, so a preprint frame and
    an article frame that carry different index labels still end up on the
    same row instead of producing extra, half-empty rows.

    Args:
        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames. Each pair consists of:
            - A preprint DataFrame (e.g., containing information about preprints).
            - An article DataFrame (e.g., containing information about corresponding articles).

    Returns:
        pd.DataFrame: A single DataFrame where:
            - Each row corresponds to a preprint-article pair.
            - Column names of both frames are kept unchanged.
            - Index is reset for the entire DataFrame.
        An empty DataFrame is returned when ``samples`` is empty.
    """
    paired_rows = [
        pd.concat(
            # Drop the per-frame labels so axis=1 aligns rows positionally.
            [preprint.reset_index(drop=True), article.reset_index(drop=True)],
            axis=1,
        )
        for preprint, article in samples
    ]

    # pd.concat raises on an empty sequence; an empty result is more useful.
    if not paired_rows:
        return pd.DataFrame()

    return pd.concat(paired_rows, ignore_index=True)

def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
    """
    Splits a DataFrame with prefixed columns into one [preprint, article] pair per row.

    Columns whose name starts with ``prpnt`` form the preprint side and
    columns whose name starts with ``article`` form the article side; all
    other columns are ignored. Each element of a pair is a one-row DataFrame
    that keeps the index label of the row it was taken from.

    Args:
        df (pd.DataFrame): The input DataFrame with columns prefixed by `prpnt` and `article`.

    Returns:
        List[List[pd.DataFrame]]: A list of pairs of DataFrames [preprint, article],
        one pair per row of ``df`` (an empty list for an empty DataFrame).
    """
    def _columns_with_prefix(prefix):
        # Column names are matched with str.startswith, in their original order.
        return [name for name in df.columns if name.startswith(prefix)]

    preprint_names = _columns_with_prefix("prpnt")
    article_names = _columns_with_prefix("article")

    # Independent copies, so the returned rows are not views of the input frame.
    preprint_side = df[preprint_names].copy()
    article_side = df[article_names].copy()

    pairs = []
    for position in range(len(df)):
        # A list indexer makes iloc return a one-row DataFrame, not a Series.
        row_selector = [position]
        pairs.append(
            [preprint_side.iloc[row_selector], article_side.iloc[row_selector]]
        )
    return pairs