Spaces:

tmencatt
/

MatchPrePrintArticles

Sleeping

File size: 47,412 Bytes

b5cf002

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from enum import Enum\n",
    "from typing import List, Dict, Any\n",
    "from dataclasses import dataclass\n",
    "from tqdm import tqdm\n",
    "\n",
    "import os\n",
    "import yaml\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import pyalex\n",
    "from pyalex import Works\n",
    "from src.utils.io_utils import PROJECT_ROOT\n",
    "from src.dataset.Dataset import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configurations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class ConfigAugmentation:\n",
    "    \"\"\"Configuration for OpenAlex features\"\"\"\n",
    "    basic: Dict[str, bool] = None  # id, doi, title, etc\n",
    "    source: Dict[str, bool] = None  # journal info\n",
    "    authors: Dict[str, bool] = None  # author details\n",
    "    metrics: Dict[str, bool] = None  # citations, fwci, etc\n",
    "    classification: Dict[str, bool] = None  # topics, concepts\n",
    "    access: Dict[str, bool] = None  # OA status\n",
    "    related_works: Dict[str, bool] = None  # references\n",
    "    abstract: bool = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset Loading "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the dataset \n",
    "\n",
    "class DatasetType(Enum):\n",
    "    FULL_RAW = \"full_raw\"\n",
    "    PARTIAL_RAW = \"partial_raw\"\n",
    "    FULL_AUGMENTED = \"full_augmented\"\n",
    "    PARTIAL_AUGMENTED = \"partial_augmented\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class Field:\n",
    "    \"\"\"Field configuration for data extraction\"\"\"\n",
    "    name: str\n",
    "    path: List[str]\n",
    "    default: Any = None\n",
    "\n",
    "class AlexFields:\n",
    "    \"\"\"OpenAlex field definitions\"\"\"\n",
    "    \n",
    "    BASIC = [\n",
    "        Field(\"id\", [\"id\"]),\n",
    "        Field(\"doi\", [\"doi\"]),\n",
    "        Field(\"title\", [\"title\"]),\n",
    "        Field(\"display_name\", [\"display_name\"]),\n",
    "        Field(\"publication_year\", [\"publication_year\"]),\n",
    "        Field(\"publication_date\", [\"publication_date\"]),\n",
    "        Field(\"language\", [\"language\"]),\n",
    "        Field(\"type\", [\"type\"]),\n",
    "        Field(\"type_crossref\", [\"type_crossref\"])\n",
    "    ]\n",
    "    \n",
    "    SOURCE = [\n",
    "        Field(\"journal_name\", [\"primary_location\", \"source\", \"display_name\"]),\n",
    "        Field(\"issn\", [\"primary_location\", \"source\", \"issn\"]),\n",
    "        Field(\"issn_l\", [\"primary_location\", \"source\", \"issn_l\"]),\n",
    "        Field(\"publisher\", [\"primary_location\", \"source\", \"host_organization_name\"]),\n",
    "        Field(\"type\", [\"primary_location\", \"source\", \"type\"])\n",
    "    ]\n",
    "\n",
    "    METRICS = [\n",
    "        Field(\"cited_by_count\", [\"cited_by_count\"]),\n",
    "        Field(\"cited_by_percentile\", [\"citation_normalized_percentile\"]),\n",
    "        Field(\"is_retracted\", [\"is_retracted\"]),\n",
    "        Field(\"fwci\", [\"fwci\"]),\n",
    "        Field(\"referenced_works_count\", [\"referenced_works_count\"])\n",
    "    ]\n",
    "\n",
    "    ACCESS = [\n",
    "        Field(\"is_oa\", [\"open_access\", \"is_oa\"]),\n",
    "        Field(\"oa_status\", [\"open_access\", \"oa_status\"]),\n",
    "        Field(\"oa_url\", [\"open_access\", \"oa_url\"]),\n",
    "        Field(\"pdf_url\", [\"primary_location\", \"pdf_url\"]),\n",
    "        Field(\"license\", [\"primary_location\", \"license\"]) \n",
    "    ]\n",
    "\n",
    "def get_nested_value(data: Dict, path: List[str], default: Any = None) -> Any:\n",
    "    \"\"\"Extract nested value from dictionary using path\"\"\"\n",
    "    value = data\n",
    "    for key in path:\n",
    "        try:\n",
    "            value = value[key]\n",
    "        except (KeyError, TypeError):\n",
    "            return default\n",
    "    return value\n",
    "\n",
    "class DataAugmenter:\n",
    "    \"\"\"Class for augmenting data with OpenAlex features\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"Initialize augmenter with API credentials\"\"\"\n",
    "        self.profile = self._load_profile()\n",
    "        self.email = self.profile[\"email\"]\n",
    "        self.filters = ConfigAugmentation(\n",
    "        basic={\n",
    "            \"id\": True,\n",
    "            \"doi\": True,\n",
    "            \"title\": True,\n",
    "            \"display_name\": True,\n",
    "            \"publication_year\": True,\n",
    "            \"publication_date\": True,\n",
    "            \"language\": True,\n",
    "            \"type\": True,\n",
    "            \"type_crossref\": True\n",
    "        },\n",
    "        source={\n",
    "            \"journal_name\": True,\n",
    "            \"issn\": True,\n",
    "            \"issn_l\": True,\n",
    "            \"publisher\": True,\n",
    "            \"type\": True\n",
    "        },\n",
    "        authors={\n",
    "            \"position\": True,\n",
    "            \"name\": True,\n",
    "            \"id\": True,\n",
    "            \"orcid\": True,\n",
    "            \"is_corresponding\": True,\n",
    "            \"affiliations\": False\n",
    "        },\n",
    "        metrics={\n",
    "            \"cited_by_count\": True,\n",
    "            \"cited_by_percentile\": False,\n",
    "            \"is_retracted\": True,\n",
    "            \"fwci\": True,\n",
    "            \"referenced_works_count\": True\n",
    "        },\n",
    "        classification={\n",
    "            \"primary_topic\": True,\n",
    "            \"topics\": False,\n",
    "            \"concepts\": False,\n",
    "        },\n",
    "        access={\n",
    "            \"is_oa\": True,\n",
    "            \"oa_status\": True,\n",
    "            \"oa_url\": True,\n",
    "            \"pdf_url\": True,\n",
    "            \"license\": True\n",
    "        },\n",
    "        related_works={\n",
    "            \"references\": True,\n",
    "            \"referenced_by_count\": True,\n",
    "            \"related\": True\n",
    "        },\n",
    "        abstract=True\n",
    "    )\n",
    "        \n",
    "        pyalex.config.email = self.email\n",
    "        \n",
    "    def _load_profile(self) -> Dict[str, str]:\n",
    "        \"\"\"Load API credentials from profile\"\"\"\n",
    "        profile_path = f\"{PROJECT_ROOT}/user_information/profile.yaml\"\n",
    "        \n",
    "        assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n",
    "        assert os.path.exists(profile_path), \"create a profile.yaml with your email (email:) and your api key (api_key:). Go here to get one https://dev.elsevier.com/\"\n",
    "\n",
    "        \n",
    "        with open(profile_path, \"r\") as f:\n",
    "            profile = yaml.safe_load(f)\n",
    "            \n",
    "        return {\n",
    "            \"email\": profile[\"email\"]\n",
    "        }\n",
    "\n",
    "    def get_alex_features(self, doi: str) -> Dict:\n",
    "        \"\"\"Extract all OpenAlex features for a DOI\"\"\"\n",
    "        try:\n",
    "            work = Works()[f\"https://doi.org/{doi}\"]\n",
    "            result = {}\n",
    "\n",
    "            # Basic metadata\n",
    "            result[\"basic\"] = {\n",
    "                field.name: get_nested_value(work, field.path, None)\n",
    "                for field in AlexFields.BASIC\n",
    "            }\n",
    "            \n",
    "            # Source/journal info\n",
    "            result[\"source\"] = {\n",
    "                field.name: get_nested_value(work, field.path, None)\n",
    "                for field in AlexFields.SOURCE\n",
    "            }\n",
    "            \n",
    "            # Authors with affiliations\n",
    "            try:\n",
    "                result[\"authors\"] = [\n",
    "                    {\n",
    "                        \"position\": auth.get(\"author_position\", None),\n",
    "                        \"name\": auth.get(\"author\", {}).get(\"display_name\", None),\n",
    "                        \"id\": auth.get(\"author\", {}).get(\"id\", None),\n",
    "                        \"orcid\": auth.get(\"author\", {}).get(\"orcid\", None),\n",
    "                        \"is_corresponding\": auth.get(\"is_corresponding\", None),\n",
    "                        \"affiliations\": [\n",
    "                            {\n",
    "                                \"name\": inst.get(\"display_name\", None),\n",
    "                                \"id\": inst.get(\"id\", None),\n",
    "                                \"country\": inst.get(\"country_code\", None),\n",
    "                                \"type\": inst.get(\"type\", None),\n",
    "                                \"ror\": inst.get(\"ror\", None)\n",
    "                            }\n",
    "                            for inst in auth.get(\"institutions\", [])\n",
    "                        ]\n",
    "                    }\n",
    "                    for auth in work.get(\"authorships\", [])\n",
    "                ]\n",
    "            except:\n",
    "                result[\"authors\"] = None\n",
    "\n",
    "            # Topics and classifications  \n",
    "            try:\n",
    "                result[\"classification\"] = {\n",
    "                    \"primary_topic\": {\n",
    "                        \"name\": work.get(\"primary_topic\", {}).get(\"display_name\", None),\n",
    "                        \"score\": work.get(\"primary_topic\", {}).get(\"score\", None),\n",
    "                        \"field\": work.get(\"primary_topic\", {}).get(\"field\", {}).get(\"display_name\", None),\n",
    "                        \"subfield\": work.get(\"primary_topic\", {}).get(\"subfield\", {}).get(\"display_name\", None)\n",
    "                    },\n",
    "                    \"topics\": [\n",
    "                        {\n",
    "                            \"name\": topic.get(\"display_name\", None),\n",
    "                            \"score\": topic.get(\"score\", None),\n",
    "                            \"field\": topic.get(\"field\", {}).get(\"display_name\", None)\n",
    "                        }\n",
    "                        for topic in work.get(\"topics\", [])\n",
    "                    ],\n",
    "                    \"concepts\": [\n",
    "                        {\n",
    "                            \"name\": concept.get(\"display_name\", None),\n",
    "                            \"level\": concept.get(\"level\", None),\n",
    "                            \"score\": concept.get(\"score\", None),\n",
    "                            \"wikidata\": concept.get(\"wikidata\", None)\n",
    "                        }\n",
    "                        for concept in work.get(\"concepts\", [])\n",
    "                    ]\n",
    "                }\n",
    "            except:\n",
    "                result[\"classification\"] = None\n",
    "\n",
    "            # Metrics\n",
    "            result[\"metrics\"] = {\n",
    "                field.name: get_nested_value(work, field.path, None)\n",
    "                for field in AlexFields.METRICS\n",
    "            }\n",
    "\n",
    "            # Access info\n",
    "            result[\"access\"] = {\n",
    "                field.name: get_nested_value(work, field.path, None)\n",
    "                for field in AlexFields.ACCESS\n",
    "            }\n",
    "\n",
    "            # Abstract\n",
    "            try:\n",
    "                if \"abstract_inverted_index\" in work:\n",
    "                    abstract_dict = work[\"abstract_inverted_index\"]\n",
    "                    if abstract_dict:\n",
    "                        max_pos = max(max(positions) for positions in abstract_dict.values())\n",
    "                        words = [\"\"] * (max_pos + 1)\n",
    "                        for word, positions in abstract_dict.items():\n",
    "                            for pos in positions:\n",
    "                                words[pos] = word\n",
    "                        result[\"abstract\"] = \" \".join(words)\n",
    "                    else:\n",
    "                        result[\"abstract\"] = None\n",
    "                else:\n",
    "                    result[\"abstract\"] = None\n",
    "            except:\n",
    "                result[\"abstract\"] = None\n",
    "\n",
    "            return result\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"OpenAlex error for DOI {doi}: {e}\")\n",
    "            return {}\n",
    "    \n",
    "    def filter_augmented_data(self, data: Dict[str, Any], config: ConfigAugmentation = None) -> Dict[str, Any]:\n",
    "        \"\"\"Filter data based on configuration\n",
    "        \n",
    "        Args:\n",
    "            data: Dictionary containing raw data\n",
    "            config: Configuration specifying which features to include\n",
    "            \n",
    "        Returns:\n",
    "            Filtered dictionary containing only the configured features\n",
    "        \"\"\"\n",
    "        config = config or self.filters\n",
    "        \n",
    "        def filter_section(section_data: Dict[str, Any], section_config: Dict[str, bool]) -> Dict[str, Any]:\n",
    "            \"\"\"Filter a section of the data based on the section configuration\"\"\"\n",
    "            return {k: v for k, v in section_data.items() if k in section_config and section_config[k]}\n",
    "        \n",
    "        filtered_data = {}\n",
    "        \n",
    "        # Filter OpenAlex data\n",
    "        alex_filtered = {}\n",
    "        \n",
    "        # Basic metadata\n",
    "        if config.basic:\n",
    "            alex_filtered[\"basic\"] = filter_section(data.get(\"basic\", {}), config.basic)\n",
    "        \n",
    "        # Source/journal info\n",
    "        if config.source:\n",
    "            alex_filtered[\"source\"] = filter_section(data.get(\"source\", {}), config.source)\n",
    "        \n",
    "        # Authors\n",
    "        if config.authors:\n",
    "            authors_data = data.get(\"authors\", [])\n",
    "            filtered_authors = []\n",
    "            for author in authors_data:\n",
    "                filtered_author = filter_section(author, config.authors)\n",
    "                if config.authors.get(\"affiliations\", False):\n",
    "                    print(author.get(\"affiliations\", []))\n",
    "                    filtered_author[\"affiliations\"] = [\n",
    "                        filter_section(aff, config.authors[\"affiliations\"])\n",
    "                        for aff in author.get(\"affiliations\", [])\n",
    "                    ]\n",
    "                filtered_authors.append(filtered_author)\n",
    "            alex_filtered[\"authors\"] = filtered_authors\n",
    "        \n",
    "        # Metrics\n",
    "        if config.metrics:\n",
    "            alex_filtered[\"metrics\"] = filter_section(data.get(\"metrics\", {}), config.metrics)\n",
    "        \n",
    "        # Classification\n",
    "        if config.classification:\n",
    "            classification_data = data.get(\"classification\", {})\n",
    "            alex_filtered[\"classification\"] = {\n",
    "                k: v for k, v in classification_data.items() if k in config.classification and config.classification[k]\n",
    "            }\n",
    "        \n",
    "        # Access info\n",
    "        if config.access:\n",
    "            alex_filtered[\"access\"] = filter_section(data.get(\"access\", {}), config.access)\n",
    "        \n",
    "        # Related works\n",
    "        if config.related_works:\n",
    "            alex_filtered[\"related_works\"] = filter_section(data.get(\"related_works\", {}), config.related_works)\n",
    "        \n",
    "        # Abstract\n",
    "        if config.abstract and \"abstract\" in data:\n",
    "            alex_filtered[\"abstract\"] = data[\"abstract\"]\n",
    "        \n",
    "        filtered_data = alex_filtered\n",
    "    \n",
    "        return filtered_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'primary_topic': {'name': 'Gait Analysis and Fall Prevention in Elderly',\n",
       "  'score': 0.9994,\n",
       "  'field': 'Health Professions',\n",
       "  'subfield': 'Physical Therapy, Sports Therapy and Rehabilitation'}}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doi = \"10.2196/41082\"\n",
    "a = DataAugmenter()\n",
    "info = a.get_alex_features(doi)\n",
    "filtered_info = a.filter_augmented_data(info)\n",
    "filtered_info[\"classification\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class FullAugmentedDataset: \n",
    "\n",
    "    def __init__(self):\n",
    "        self.augmenter = DataAugmenter()\n",
    "        self.full_raw_dataset = self._load_the_dataset()\n",
    "\n",
    "    def _load_the_dataset(self, type: DatasetType = DatasetType.FULL_RAW) -> pd.DataFrame:\n",
    "        \"\"\"Load as csv file one of the datasets for training.\"\"\"\n",
    "        assert str(PROJECT_ROOT).split(\"/\")[-1] == \"MatchingPubs\", \"Please run this script in the github repo folder \"\n",
    "        \n",
    "        if type == DatasetType.FULL_RAW:\n",
    "            return pd.read_csv(f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\")\n",
    "\n",
    "    def retrieve_dois_couple(self, len: int = 1, random: bool = False, seed: bool = None, full: bool = False):\n",
    "        \"\"\"Retrieve two DOIs from the dataset\"\"\"\n",
    "        if random:\n",
    "            dois = self.full_raw_dataset.sample(n=len, random_state=seed)[[\"preprint_doi\", \"article_doi\"]]\n",
    "        else:\n",
    "            dois = self.full_raw_dataset.head(len)[[\"preprint_doi\", \"article_doi\"]]\n",
    "        if full:\n",
    "            dois = self.full_raw_dataset[[\"preprint_doi\", \"article_doi\"]]\n",
    "        return dois.to_numpy()\n",
    "    \n",
    "    @staticmethod\n",
    "    def _flatten_list(lst):\n",
    "        \"\"\"\n",
    "        Flattens a nested list into a single list. If the input is not nested, it returns the original list.\n",
    "        Handles cases where some elements are lists and others are not.\n",
    "        \"\"\"\n",
    "        if not isinstance(lst, list):  # Ensure the input is a list\n",
    "            raise ValueError(\"Input must be a list\")\n",
    "\n",
    "        def _flatten(sublist):\n",
    "            for item in sublist:\n",
    "                if isinstance(item, list):  # Check if the item is a list\n",
    "                    yield from _flatten(item)  # Recursively flatten the list\n",
    "                else:\n",
    "                    yield item  # Yield the non-list item\n",
    "\n",
    "        return list(_flatten(lst))\n",
    "    \n",
    "    def _augmented_data_to_row(self, filtered_data: Dict[str, Any], preprint: bool = True) -> pd.Series:\n",
    "        \"\"\"Transform filtered augmented data into a pandas Series\n",
    "        \n",
    "        Args:\n",
    "            filtered_data: Dictionary containing filtered OpenAlex and Elsevier data\n",
    "            preprint: If True, use prpnt_ prefix, else use article_ prefix\n",
    "            \n",
    "        Returns:\n",
    "            pd.Series: Flattened data as a single row\n",
    "        \"\"\"\n",
    "\n",
    "        additional_part = FullAugmentedDataset.filter_author(filtered_data.get(\"authors\",{}))\n",
    "        # modify the key of additional part by adding authors_ at the beginning\n",
    "        additional_part = {f\"authors_{k}\": v for k, v in additional_part.items()} \n",
    "        # remove authos key from filtreed_info\n",
    "        filtered_data.pop(\"authors\")\n",
    "        # append the additional part to the filtered_info\n",
    "        filtered_data.update(additional_part)\n",
    "        final_dictionary = FullAugmentedDataset.flatten_dict(filtered_data, preprint=preprint)\n",
    "\n",
    "        for k, v in final_dictionary.items():\n",
    "            final_dictionary[k] = \"$@$\".join(map(str, FullAugmentedDataset._flatten_list(v))) if isinstance(v, list) else [v]\n",
    "\n",
    "        return pd.DataFrame(final_dictionary)\n",
    "\n",
    "    @staticmethod\n",
    "    def filter_author(authors_info : list) -> dict:\n",
    "\n",
    "        try:\n",
    "            relevant_keys = authors_info[0].keys()\n",
    "            new_dict = {}\n",
    "            for key in relevant_keys:\n",
    "                new_dict[key] = [author[key] for author in authors_info]\n",
    "            return new_dict\n",
    "        except:\n",
    "            return {}\n",
    "    \n",
    "    @staticmethod\n",
    "    def flatten_dict(d: dict, parent_key: str = '', sep: str = '_', preprint = True) -> dict:\n",
    "        \"\"\"Flatten a nested dictionary.\n",
    "        \n",
    "        Args:\n",
    "            d (dict): The dictionary to flatten.\n",
    "            parent_key (str): The base key string to use for the flattened keys.\n",
    "            sep (str): The separator to use between parent and child keys.\n",
    "            \n",
    "        Returns:\n",
    "            dict: The flattened dictionary.\n",
    "        \"\"\"\n",
    "        addition = \"prpnt_\" if preprint else \"article_\"\n",
    "        def _flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:\n",
    "            items = []\n",
    "            for k, v in d.items():\n",
    "                new_key = f\"{parent_key}{sep}{k}\" if parent_key else k\n",
    "                if isinstance(v, dict):\n",
    "                    items.extend(_flatten_dict(v, new_key, sep=sep).items())\n",
    "                else:\n",
    "                    items.append((new_key, v))\n",
    "            return dict(items)\n",
    "        return {f\"{addition}{k}\": v for k, v in _flatten_dict(d, parent_key, sep).items()}\n",
    "\n",
    "    def process_pair(self, dois) -> pd.DataFrame:\n",
    "        \"\"\"Process a pair of DOIs and return combined rows as a DataFrame\"\"\"\n",
    "        assert len(dois) > 0\n",
    "        rows = []\n",
    "        for preprint_doi, article_doi in tqdm(dois):\n",
    "            # Get preprint features\n",
    "            preprint_features = self.augmenter.get_alex_features(preprint_doi) # augment with all the features\n",
    "            preprint_filtered = self.augmenter.filter_augmented_data(preprint_features) # filter the relevant features\n",
    "            preprint_row = self._augmented_data_to_row(preprint_filtered, True)\n",
    "\n",
    "            # Get article features\n",
    "            article_features = self.augmenter.get_alex_features(article_doi) # augment with all the features\n",
    "            article_filtered = self.augmenter.filter_augmented_data(article_features)\n",
    "            article_row = self._augmented_data_to_row(article_filtered, False)\n",
    "\n",
    "            # print(article_row.columns)\n",
    "            # print(len(preprint_row.columns))\n",
    "\n",
    "            # combined_row = pd.concat([preprint_row, article_row], axis=1)\n",
    "            # rows.append(combined_row)\n",
    "            rows.append([preprint_row, article_row])\n",
    "\n",
    "        return rows\n",
    "\n",
    "    @staticmethod\n",
    "    def transform_array(input_array, factor):\n",
    "        output_list = []\n",
    "        \n",
    "        for i, row in enumerate(input_array):\n",
    "            other_indices = np.array([j for j in range(len(input_array)) if j != i])\n",
    "            sampled_indices = np.random.choice(other_indices, size=factor, replace=False)\n",
    "            sampled_rows = [input_array[j] for j in sampled_indices]\n",
    "\n",
    "            output_list.append(pd.concat([row[0], row[1], pd.DataFrame(data=[1], columns=['label'])], axis=1))\n",
    "            for B in sampled_rows:\n",
    "                output_list.append(pd.concat([row[0], B[1], pd.DataFrame(data=[0], columns=['label'])], axis=1))\n",
    "\n",
    "        return pd.concat(output_list).reset_index(drop=True)\n",
    "\n",
    "    def get_full_dataset(self, len: int = 1, random: bool = True, seed: int = 42, full: bool = True) -> pd.DataFrame:\n",
    "        \"\"\"Process all DOI pairs and return full dataset\"\"\"\n",
    "        dois = self.retrieve_dois_couple(len, random, seed, full)\n",
    "        self.augmented_df = FullAugmentedDataset.transform_array(self.process_pair(dois), factor=3)\n",
    "        return self.augmented_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TRYING STUFF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataset with new configs\n",
    "dataset = FullAugmentedDataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 2)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dois = dataset.retrieve_dois_couple(5, random = True, seed = 42)\n",
    "dois.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "info = dataset.augmenter.get_alex_features(dois[0][0]) # augment with all the features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'basic': {'id': 'https://openalex.org/W4213260597',\n",
       "  'doi': 'https://doi.org/10.31234/osf.io/6fps2',\n",
       "  'title': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n",
       "  'display_name': 'Distance perception in virtual reality: A meta-analysis of the effect of head-mounted display characteristics',\n",
       "  'publication_year': 2022,\n",
       "  'publication_date': '2022-02-12',\n",
       "  'language': 'en',\n",
       "  'type': 'preprint',\n",
       "  'type_crossref': 'posted-content'},\n",
       " 'source': {'journal_name': None,\n",
       "  'issn': None,\n",
       "  'issn_l': None,\n",
       "  'publisher': None,\n",
       "  'type': None},\n",
       " 'authors': [{'position': 'first',\n",
       "   'name': 'Jonathan W. Kelly',\n",
       "   'id': 'https://openalex.org/A5011931977',\n",
       "   'orcid': 'https://orcid.org/0000-0002-4317-273X',\n",
       "   'is_corresponding': True,\n",
       "   'affiliations': [{'name': 'Iowa State University',\n",
       "     'id': 'https://openalex.org/I173911158',\n",
       "     'country': 'US',\n",
       "     'type': 'education',\n",
       "     'ror': 'https://ror.org/04rswrd78'}]}],\n",
       " 'classification': {'primary_topic': {'name': 'Virtual Presence and Embodiment in VR Research',\n",
       "   'score': 0.9982,\n",
       "   'field': 'Computer Science',\n",
       "   'subfield': 'Human-Computer Interaction'},\n",
       "  'topics': [{'name': 'Virtual Presence and Embodiment in VR Research',\n",
       "    'score': 0.9982,\n",
       "    'field': 'Computer Science'},\n",
       "   {'name': 'Neural Mechanisms of Visual Perception and Processing',\n",
       "    'score': 0.9906,\n",
       "    'field': 'Neuroscience'},\n",
       "   {'name': 'Spatial Ability for STEM Domains',\n",
       "    'score': 0.9727,\n",
       "    'field': 'Engineering'}],\n",
       "  'concepts': [{'name': 'Virtual reality',\n",
       "    'level': 2,\n",
       "    'score': 0.74525213,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q170519'},\n",
       "   {'name': 'Perception',\n",
       "    'level': 2,\n",
       "    'score': 0.69497585,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q160402'},\n",
       "   {'name': 'Optical head-mounted display',\n",
       "    'level': 2,\n",
       "    'score': 0.64143133,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q17105103'},\n",
       "   {'name': 'Computer science',\n",
       "    'level': 0,\n",
       "    'score': 0.4773505,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q21198'},\n",
       "   {'name': 'Psychology',\n",
       "    'level': 0,\n",
       "    'score': 0.3757282,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q9418'},\n",
       "   {'name': 'Computer vision',\n",
       "    'level': 1,\n",
       "    'score': 0.3722988,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q844240'},\n",
       "   {'name': 'Artificial intelligence',\n",
       "    'level': 1,\n",
       "    'score': 0.35102686,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q11660'},\n",
       "   {'name': 'Neuroscience',\n",
       "    'level': 1,\n",
       "    'score': 0.0,\n",
       "    'wikidata': 'https://www.wikidata.org/wiki/Q207011'}]},\n",
       " 'metrics': {'cited_by_count': 6,\n",
       "  'cited_by_percentile': {'value': 0.997093,\n",
       "   'is_in_top_1_percent': True,\n",
       "   'is_in_top_10_percent': True},\n",
       "  'is_retracted': False,\n",
       "  'fwci': None,\n",
       "  'referenced_works_count': 89},\n",
       " 'access': {'is_oa': True,\n",
       "  'oa_status': 'green',\n",
       "  'oa_url': 'https://psyarxiv.com/6fps2/download',\n",
       "  'pdf_url': 'https://psyarxiv.com/6fps2/download',\n",
       "  'license': None},\n",
       " 'abstract': 'Distances are commonly underperceived in virtual reality (VR), and this finding has been documented repeatedly over more than two decades of research. Yet, there is evidence that perceived distance is more accurate in modern compared to older head-mounted displays (HMDs). This meta-analysis of 131 studies describes egocentric distance perception across 20 HMDs, and also examines the relationship between perceived distance and technical HMD characteristics. Judged distance was positively associated with HMD field of view (FOV), positively associated with HMD resolution, and negatively associated with HMD weight. The effects of FOV and resolution were more pronounced among heavier HMDs. These findings suggest that future improvements in these technical characteristics may be central to resolving the problem of distance underperception in VR.'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_info = dataset.augmenter.filter_augmented_data(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prpnt_basic_id</th>\n",
       "      <th>prpnt_basic_doi</th>\n",
       "      <th>prpnt_basic_title</th>\n",
       "      <th>prpnt_basic_display_name</th>\n",
       "      <th>prpnt_basic_publication_year</th>\n",
       "      <th>prpnt_basic_publication_date</th>\n",
       "      <th>prpnt_basic_language</th>\n",
       "      <th>prpnt_basic_type</th>\n",
       "      <th>prpnt_basic_type_crossref</th>\n",
       "      <th>prpnt_source_journal_name</th>\n",
       "      <th>...</th>\n",
       "      <th>prpnt_access_oa_status</th>\n",
       "      <th>prpnt_access_oa_url</th>\n",
       "      <th>prpnt_access_pdf_url</th>\n",
       "      <th>prpnt_access_license</th>\n",
       "      <th>prpnt_abstract</th>\n",
       "      <th>prpnt_authors_position</th>\n",
       "      <th>prpnt_authors_name</th>\n",
       "      <th>prpnt_authors_id</th>\n",
       "      <th>prpnt_authors_orcid</th>\n",
       "      <th>prpnt_authors_is_corresponding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://openalex.org/W4213260597</td>\n",
       "      <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
       "      <td>Distance perception in virtual reality: A meta...</td>\n",
       "      <td>Distance perception in virtual reality: A meta...</td>\n",
       "      <td>2022</td>\n",
       "      <td>2022-02-12</td>\n",
       "      <td>en</td>\n",
       "      <td>preprint</td>\n",
       "      <td>posted-content</td>\n",
       "      <td>None</td>\n",
       "      <td>...</td>\n",
       "      <td>green</td>\n",
       "      <td>https://psyarxiv.com/6fps2/download</td>\n",
       "      <td>https://psyarxiv.com/6fps2/download</td>\n",
       "      <td>None</td>\n",
       "      <td>Distances are commonly underperceived in virtu...</td>\n",
       "      <td>first</td>\n",
       "      <td>Jonathan W. Kelly</td>\n",
       "      <td>https://openalex.org/A5011931977</td>\n",
       "      <td>https://orcid.org/0000-0002-4317-273X</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 33 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     prpnt_basic_id                        prpnt_basic_doi  \\\n",
       "0  https://openalex.org/W4213260597  https://doi.org/10.31234/osf.io/6fps2   \n",
       "\n",
       "                                   prpnt_basic_title  \\\n",
       "0  Distance perception in virtual reality: A meta...   \n",
       "\n",
       "                            prpnt_basic_display_name  \\\n",
       "0  Distance perception in virtual reality: A meta...   \n",
       "\n",
       "   prpnt_basic_publication_year prpnt_basic_publication_date  \\\n",
       "0                          2022                   2022-02-12   \n",
       "\n",
       "  prpnt_basic_language prpnt_basic_type prpnt_basic_type_crossref  \\\n",
       "0                   en         preprint            posted-content   \n",
       "\n",
       "  prpnt_source_journal_name  ... prpnt_access_oa_status  \\\n",
       "0                      None  ...                  green   \n",
       "\n",
       "                   prpnt_access_oa_url                 prpnt_access_pdf_url  \\\n",
       "0  https://psyarxiv.com/6fps2/download  https://psyarxiv.com/6fps2/download   \n",
       "\n",
       "  prpnt_access_license                                     prpnt_abstract  \\\n",
       "0                 None  Distances are commonly underperceived in virtu...   \n",
       "\n",
       "   prpnt_authors_position prpnt_authors_name  \\\n",
       "0                   first  Jonathan W. Kelly   \n",
       "\n",
       "                   prpnt_authors_id                    prpnt_authors_orcid  \\\n",
       "0  https://openalex.org/A5011931977  https://orcid.org/0000-0002-4317-273X   \n",
       "\n",
       "   prpnt_authors_is_corresponding  \n",
       "0                            True  \n",
       "\n",
       "[1 rows x 33 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "row = dataset._augmented_data_to_row(filtered_info)\n",
    "row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:04<00:00,  1.02it/s]\n",
      "/var/folders/kp/b80wd80s53l95yjb77jn_l0r0000gn/T/ipykernel_17064/485421214.py:140: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  return pd.concat(output_list).reset_index(drop=True)\n"
     ]
    }
   ],
   "source": [
    "df = dataset.get_full_dataset(5, full=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['10.31234/osf.io/6fps2' '10.1109/tvcg.2022.3196606']\n",
      " ['10.5194/acpd-11-3071-2011' '10.5194/acp-12-3837-2012']\n",
      " ['10.1101/2020.08.07.241687' '10.1021/acscentsci.1c00703']\n",
      " ['10.21203/rs.3.rs-62250/v1' '10.1016/j.vetpar.2021.109373']\n",
      " ['10.21203/rs.3.rs-2640242/v1' '10.1007/s10499-023-01047-1']]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prpnt_basic_doi</th>\n",
       "      <th>article_basic_doi</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
       "      <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
       "      <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
       "      <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://doi.org/10.31234/osf.io/6fps2</td>\n",
       "      <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
       "      <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
       "      <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
       "      <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>https://doi.org/10.5194/acpd-11-3071-2011</td>\n",
       "      <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
       "      <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
       "      <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
       "      <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>https://doi.org/10.1101/2020.08.07.241687</td>\n",
       "      <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
       "      <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
       "      <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
       "      <td>https://doi.org/10.1109/tvcg.2022.3196606</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-62250/v1</td>\n",
       "      <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
       "      <td>https://doi.org/10.1007/s10499-023-01047-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
       "      <td>https://doi.org/10.5194/acp-12-3837-2012</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
       "      <td>https://doi.org/10.1016/j.vetpar.2021.109373</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>https://doi.org/10.21203/rs.3.rs-2640242/v1</td>\n",
       "      <td>https://doi.org/10.1021/acscentsci.1c00703</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                prpnt_basic_doi  \\\n",
       "0         https://doi.org/10.31234/osf.io/6fps2   \n",
       "1         https://doi.org/10.31234/osf.io/6fps2   \n",
       "2         https://doi.org/10.31234/osf.io/6fps2   \n",
       "3         https://doi.org/10.31234/osf.io/6fps2   \n",
       "4     https://doi.org/10.5194/acpd-11-3071-2011   \n",
       "5     https://doi.org/10.5194/acpd-11-3071-2011   \n",
       "6     https://doi.org/10.5194/acpd-11-3071-2011   \n",
       "7     https://doi.org/10.5194/acpd-11-3071-2011   \n",
       "8     https://doi.org/10.1101/2020.08.07.241687   \n",
       "9     https://doi.org/10.1101/2020.08.07.241687   \n",
       "10    https://doi.org/10.1101/2020.08.07.241687   \n",
       "11    https://doi.org/10.1101/2020.08.07.241687   \n",
       "12    https://doi.org/10.21203/rs.3.rs-62250/v1   \n",
       "13    https://doi.org/10.21203/rs.3.rs-62250/v1   \n",
       "14    https://doi.org/10.21203/rs.3.rs-62250/v1   \n",
       "15    https://doi.org/10.21203/rs.3.rs-62250/v1   \n",
       "16  https://doi.org/10.21203/rs.3.rs-2640242/v1   \n",
       "17  https://doi.org/10.21203/rs.3.rs-2640242/v1   \n",
       "18  https://doi.org/10.21203/rs.3.rs-2640242/v1   \n",
       "19  https://doi.org/10.21203/rs.3.rs-2640242/v1   \n",
       "\n",
       "                               article_basic_doi  label  \n",
       "0      https://doi.org/10.1109/tvcg.2022.3196606      1  \n",
       "1     https://doi.org/10.1007/s10499-023-01047-1      0  \n",
       "2       https://doi.org/10.5194/acp-12-3837-2012      0  \n",
       "3   https://doi.org/10.1016/j.vetpar.2021.109373      0  \n",
       "4       https://doi.org/10.5194/acp-12-3837-2012      1  \n",
       "5   https://doi.org/10.1016/j.vetpar.2021.109373      0  \n",
       "6      https://doi.org/10.1109/tvcg.2022.3196606      0  \n",
       "7     https://doi.org/10.1007/s10499-023-01047-1      0  \n",
       "8     https://doi.org/10.1021/acscentsci.1c00703      1  \n",
       "9     https://doi.org/10.1007/s10499-023-01047-1      0  \n",
       "10     https://doi.org/10.1109/tvcg.2022.3196606      0  \n",
       "11  https://doi.org/10.1016/j.vetpar.2021.109373      0  \n",
       "12  https://doi.org/10.1016/j.vetpar.2021.109373      1  \n",
       "13      https://doi.org/10.5194/acp-12-3837-2012      0  \n",
       "14     https://doi.org/10.1109/tvcg.2022.3196606      0  \n",
       "15    https://doi.org/10.1021/acscentsci.1c00703      0  \n",
       "16    https://doi.org/10.1007/s10499-023-01047-1      1  \n",
       "17      https://doi.org/10.5194/acp-12-3837-2012      0  \n",
       "18  https://doi.org/10.1016/j.vetpar.2021.109373      0  \n",
       "19    https://doi.org/10.1021/acscentsci.1c00703      0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "print(dois)\n",
    "display(df[['prpnt_basic_doi', 'article_basic_doi', 'label']])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}