Spaces:

tmencatt
/

MatchPrePrintArticles

Sleeping

App Files Files Community

MatchPrePrintArticles / src /utils /struct_utils.py

tmencatt

app

b5cf002 4 months ago

raw

history blame contribute delete

3.78 kB

	def flatten_list(lst):
	    """
	    Flatten an arbitrarily nested list into a new, single-level list.

	    Only ``list`` instances are descended into; every other element
	    (including tuples, sets and strings) is kept as-is. Element order is
	    preserved. A new list is always returned, even when the input contains
	    no nested lists.

	    Args:
	        lst (list): The (possibly nested) list to flatten.

	    Returns:
	        list: A new flat list with the non-list elements in traversal order.

	    Raises:
	        ValueError: If ``lst`` is not a list.
	    """
	    if not isinstance(lst, list):
	        raise ValueError("You must provide a valid list")

	    flat = []
	    # Explicit stack of iterators instead of recursion, so the nesting depth
	    # is not bounded by the interpreter's recursion limit.
	    pending = [iter(lst)]
	    while pending:
	        for item in pending[-1]:
	            if isinstance(item, list):
	                # Descend; the parent iterator resumes after this sublist.
	                pending.append(iter(item))
	                break
	            flat.append(item)
	        else:
	            # Current level exhausted: go back to the enclosing level.
	            pending.pop()
	    return flat

	def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
	    """Collapse a nested dictionary into a single-level dictionary.

	    Keys of nested dictionaries are joined to their ancestors' keys with
	    ``sep``. Values that are not ``dict`` instances are copied through
	    unchanged. A nested dictionary that is empty contributes no entries.

	    Args:
	        d (dict): The dictionary to flatten.
	        parent_key (str): Prefix prepended to every produced key; when it is
	            empty, top-level keys are kept exactly as they are.
	        sep (str): The separator placed between a prefix and a child key.

	    Returns:
	        dict: The flattened dictionary.

	    Raises:
	        ValueError: If ``d`` is not a dictionary.
	    """
	    if not isinstance(d, dict):
	        raise ValueError("You must provide a valid dictionary.")

	    flat = {}

	    def _collect(node, prefix):
	        # Depth-first walk that writes leaf values straight into ``flat``.
	        for key, value in node.items():
	            full_key = key if not prefix else f"{prefix}{sep}{key}"
	            if not isinstance(value, dict):
	                flat[full_key] = value
	                continue
	            _collect(value, full_key)

	    _collect(d, parent_key)
	    return flat


	def filter_dict_by_keys(original_dict, relevant_keys):
	    """
	    Build a new dictionary restricted to the requested keys.

	    Keys listed in ``relevant_keys`` that are absent from ``original_dict``
	    are skipped. The result follows the iteration order of ``relevant_keys``.

	    Args:
	        original_dict (dict): The dictionary to filter.
	        relevant_keys (set): The keys to keep (any iterable of keys works).

	    Returns:
	        dict: A new dictionary holding only the relevant key-value pairs.
	    """
	    kept = {}
	    for wanted in relevant_keys:
	        if wanted in original_dict:
	            kept[wanted] = original_dict[wanted]
	    return kept


	from typing import List
	import pandas as pd

	def custom_struct_to_df(samples: List[List[pd.DataFrame]]) -> pd.DataFrame:
	    """
	    Merge a list of [preprint, article] DataFrame pairs into one DataFrame.

	    Each pair is joined side by side (column-wise), the joined pairs are then
	    stacked vertically, and the index of the result is reset.

	    Args:
	        samples (List[List[pd.DataFrame]]): A list of pairs of DataFrames.
	            Each pair consists of:
	            - A preprint DataFrame (e.g., columns prefixed with 'prpnt').
	            - An article DataFrame (e.g., columns prefixed with 'article').

	    Returns:
	        pd.DataFrame: The consolidated DataFrame with all columns of both
	        members of each pair and a fresh 0..n-1 index. An empty DataFrame is
	        returned when ``samples`` contains no pairs.
	    """
	    # NOTE: column-wise concat aligns the two frames on their index labels,
	    # so both members of a pair are expected to share the same index.
	    pair_frames = [
	        pd.concat([preprint, article], axis=1)
	        for preprint, article in samples
	    ]

	    # pd.concat raises ValueError when given nothing to concatenate.
	    if not pair_frames:
	        return pd.DataFrame()

	    return pd.concat(pair_frames).reset_index(drop=True)

	def df_to_custom_struct(df: pd.DataFrame) -> List[List[pd.DataFrame]]:
	    """
	    Split a DataFrame with prefixed columns into per-row [preprint, article] pairs.

	    Columns whose name starts with ``prpnt`` form the preprint part, columns
	    whose name starts with ``article`` form the article part; any other
	    column is left out of the result.

	    Args:
	        df (pd.DataFrame): The input DataFrame with columns prefixed by
	            `prpnt` and `article`.

	    Returns:
	        List[List[pd.DataFrame]]: One [preprint, article] pair per input row,
	        each member being a single-row DataFrame.
	    """
	    # Sort the column names into the two groups by prefix.
	    preprint_columns = []
	    article_columns = []
	    for name in df.columns:
	        if name.startswith("prpnt"):
	            preprint_columns.append(name)
	        if name.startswith("article"):
	            article_columns.append(name)

	    # Independent copies so the returned rows are not views of ``df``.
	    preprint_part = df[preprint_columns].copy()
	    article_part = df[article_columns].copy()

	    pairs = []
	    for position in range(len(df)):
	        # A one-element list selector makes iloc return a DataFrame
	        # rather than a Series.
	        selector = [position]
	        pairs.append([preprint_part.iloc[selector], article_part.iloc[selector]])
	    return pairs