Spaces:

tmencatt
/

MatchPrePrintArticles

Build error

File size: 13,415 Bytes

b5cf002

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/giorgosnikolaou/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n",
      "  warnings.warn(\n",
      "[nltk_data] Downloading package words to\n",
      "[nltk_data]     /Users/giorgosnikolaou/nltk_data...\n",
      "[nltk_data]   Package words is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import gradio as gr\n",
    "import pandas as pd\n",
    "import pandas as pd\n",
    "from src.utils.io_utils import PROJECT_ROOT\n",
    "from run_augmenter import negative_sampler  , positive_sampler\n",
    "from pathlib import Path\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7860\n",
      "Running on public URL: https://85b886469a8c17104c.gradio.live\n",
      "\n",
      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://85b886469a8c17104c.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "random\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Negative Sampling: 100%|██████████| 100/100 [00:11<00:00,  8.43it/s]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def augment_interface(factor, type_or_difficulty, use_default, csv_file=None):\n",
    "    \"\"\"Negative Tool Sampler: Wrapper to handle negative dataset augmentation.\n",
    "\n",
    "    Args:\n",
    "        factor: Value of the factor input; forwarded to negative_sampler unchanged.\n",
    "        type_or_difficulty: Augmentation type or difficulty selected in the dropdown.\n",
    "        use_default: When truthy, the default CSV under PROJECT_ROOT/data is used.\n",
    "        csv_file: Uploaded CSV, consulted only when use_default is falsy.\n",
    "\n",
    "    Returns:\n",
    "        A 3-tuple: the path of the CSV written to disk, the head of the\n",
    "        augmented DataFrame, and an update that makes the preview visible.\n",
    "\n",
    "    Raises:\n",
    "        gr.Error: When no input CSV is available or sampling fails. The first\n",
    "            output slot is bound to a file component, so an error message must\n",
    "            not be returned there; raising lets Gradio display the message.\n",
    "    \"\"\"\n",
    "    # Input validation lives outside the try block so these errors are not re-wrapped below.\n",
    "    if use_default:\n",
    "        input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n",
    "        if not Path(input_csv_path).exists():\n",
    "            raise gr.Error(\"Error: Default CSV file not found!\")\n",
    "    elif csv_file is not None:\n",
    "        # NOTE(review): the upload may be an object exposing .name or a plain path string,\n",
    "        # depending on the Gradio version; accept both.\n",
    "        input_csv_path = getattr(csv_file, \"name\", csv_file)\n",
    "    else:\n",
    "        raise gr.Error(\"Error: Please select default or upload a CSV file.\")\n",
    "\n",
    "    try:\n",
    "        augmented_df = negative_sampler(input_csv_path, factor, type_or_difficulty)\n",
    "        output_csv_path = \"augmented_dataset.csv\"\n",
    "        augmented_df.to_csv(output_csv_path, index=False)\n",
    "    except Exception as e:\n",
    "        # Surface the failure in the UI instead of handing a message to the file output.\n",
    "        raise gr.Error(f\"Error during processing: {str(e)}\") from e\n",
    "\n",
    "    return output_csv_path, augmented_df.head(), gr.update(visible=True)\n",
    "\n",
    "\n",
    "def positive_sampler_interface(use_default, csv_file=None, size=10, random=True, seed=42, full=False):\n",
    "    \"\"\"Positive Tool Sampler: Wrapper to handle positive dataset augmentation with additional arguments.\n",
    "\n",
    "    Args:\n",
    "        use_default: When truthy, the default CSV under PROJECT_ROOT/data is used.\n",
    "        csv_file: Uploaded CSV, consulted only when use_default is falsy.\n",
    "        size: Forwarded to positive_sampler as size.\n",
    "        random: Forwarded to positive_sampler as random.\n",
    "        seed: Forwarded to positive_sampler as seed.\n",
    "        full: Forwarded to positive_sampler as full.\n",
    "\n",
    "    Returns:\n",
    "        A 3-tuple: the path of the CSV written to disk, the head of the\n",
    "        augmented DataFrame, and an update that makes the preview visible.\n",
    "\n",
    "    Raises:\n",
    "        gr.Error: When no input CSV is available or sampling fails. The first\n",
    "            output slot is bound to a file component, so an error message must\n",
    "            not be returned there; raising lets Gradio display the message.\n",
    "    \"\"\"\n",
    "    # Input validation lives outside the try block so these errors are not re-wrapped below.\n",
    "    if use_default:\n",
    "        input_csv_path = f\"{PROJECT_ROOT}/data/crossref-preprint-article-relationships-Aug-2023.csv\"\n",
    "        if not Path(input_csv_path).exists():\n",
    "            raise gr.Error(\"Error: Default CSV file not found!\")\n",
    "    elif csv_file is not None:\n",
    "        # NOTE(review): the upload may be an object exposing .name or a plain path string,\n",
    "        # depending on the Gradio version; accept both.\n",
    "        input_csv_path = getattr(csv_file, \"name\", csv_file)\n",
    "    else:\n",
    "        raise gr.Error(\"Error: Please select default or upload a CSV file.\")\n",
    "\n",
    "    try:\n",
    "        # Call the positive sampler function with additional arguments\n",
    "        augmented_df = positive_sampler(\n",
    "            optional_path=input_csv_path,\n",
    "            size=size,\n",
    "            random=random,\n",
    "            seed=seed,\n",
    "            full=full,\n",
    "        )\n",
    "        output_csv_path = \"positive_augmented_dataset.csv\"\n",
    "        augmented_df.to_csv(output_csv_path, index=False)\n",
    "    except Exception as e:\n",
    "        # Surface the failure in the UI instead of handing a message to the file output.\n",
    "        raise gr.Error(f\"Error during processing: {str(e)}\") from e\n",
    "\n",
    "    return output_csv_path, augmented_df.head(), gr.update(visible=True)\n",
    "\n",
    "\n",
    "def reset_output():\n",
    "    \"\"\"Clear the outputs and hide the preview table.\n",
    "\n",
    "    Returns:\n",
    "        A 3-tuple: None, None, and an update that sets visible=False.\n",
    "    \"\"\"\n",
    "    hide_preview = gr.update(visible=False)\n",
    "    return None, None, hide_preview\n",
    "\n",
    "# The stylesheet is a plain string: it has no placeholders, so an f-string with doubled braces is unnecessary.\n",
    "# Every button has its own DOM id (ids must be unique per page); grouped selectors keep both sections styled alike.\n",
    "with gr.Blocks(css=\"\"\"\n",
    "    .gradio-container {\n",
    "        font-family: Arial, sans-serif;\n",
    "        max-width: 900px;\n",
    "        margin: auto;\n",
    "    }\n",
    "    h1 {\n",
    "        text-align: center;\n",
    "        color: white;\n",
    "        font-size: 60px;\n",
    "        margin-bottom: 0px;\n",
    "    }\n",
    "    h2 {\n",
    "        text-align: center;\n",
    "        color: #ff0000;\n",
    "        font-size: 16px;\n",
    "        font-weight: normal;\n",
    "        margin-top: 0px;\n",
    "    }\n",
    "    .title {\n",
    "        text-align: center;\n",
    "        font-size: 40px;\n",
    "        margin-top: 30px;\n",
    "        margin-bottom: 20px;\n",
    "    }\n",
    "    .title .positive {\n",
    "        color: #ff0000;\n",
    "    }\n",
    "    .title .negative {\n",
    "        color: #ff0000;\n",
    "    }\n",
    "    .title .tool {\n",
    "        color: white;\n",
    "    }\n",
    "    .title .sampler {\n",
    "        color: #ff0000;\n",
    "    }\n",
    "    .description {\n",
    "        text-align: center;\n",
    "        margin-bottom: 20px;\n",
    "    }\n",
    "    #pos-submit-button, #neg-submit-button {\n",
    "        background-color: #ff0000;\n",
    "        color: white;\n",
    "        font-size: 16px;\n",
    "        border: none;\n",
    "        border-radius: 5px;\n",
    "        padding: 10px 20px;\n",
    "    }\n",
    "    #pos-reset-button, #neg-reset-button {\n",
    "        background-color: #d3d3d3;\n",
    "        color: black;\n",
    "        font-size: 16px;\n",
    "        border: none;\n",
    "        border-radius: 5px;\n",
    "        padding: 10px 20px;\n",
    "    }\n",
    "\"\"\") as app:\n",
    "    # Main Title Section\n",
    "    gr.Markdown(\"\"\"\n",
    "    <h1>ENTC</h1>\n",
    "    <h2>Entrepreneurship and Technology Commercialization · EPFL</h2>\n",
    "    \"\"\")\n",
    "\n",
    "    # Positive Tool Sampler Section\n",
    "    gr.Markdown(\"\"\"\n",
    "    <div class=\"title\">\n",
    "        <span class=\"positive\">Positive</span>\n",
    "        <span class=\"tool\">Tool</span>\n",
    "        <span class=\"sampler\">Sampler</span>\n",
    "    </div>\n",
    "    \"\"\")\n",
    "\n",
    "    gr.Markdown(\"\"\"\n",
    "    <p class=\"description\">\n",
    "    This tool takes a list of DOIs and augments them using the OpenAlex API.\n",
    "    It is designed to complement the Negative Tool Sampler, enabling the creation of complete datasets.\n",
    "    </p>\n",
    "    \"\"\")\n",
    "\n",
    "    with gr.Group():\n",
    "        with gr.Row():\n",
    "            pos_use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n",
    "            pos_csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n",
    "\n",
    "        with gr.Row():\n",
    "            # precision=0 makes the Number components deliver integers for the sample count and seed.\n",
    "            size_input = gr.Number(label=\"Number of Samples\", value=10, precision=0, info=\"Specify the number of samples to generate.\")\n",
    "            random_input = gr.Checkbox(label=\"Sample Randomly\", value=True, info=\"Whether to sample randomly.\")\n",
    "            seed_input = gr.Number(label=\"Random Seed\", value=42, precision=0, info=\"Random seed for reproducibility.\")\n",
    "            full_input = gr.Checkbox(label=\"Full Dataset Mode\", value=False, info=\"Indicate whether to use the full dataset.\")\n",
    "\n",
    "        with gr.Group():\n",
    "            pos_output_file = gr.File(label=\"Download Augmented Dataset\")\n",
    "            pos_dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n",
    "            with gr.Row():\n",
    "                pos_submit_button = gr.Button(\"Submit 🚀\", elem_id=\"pos-submit-button\")\n",
    "                pos_reset_button = gr.Button(\"Reset 🔄\", elem_id=\"pos-reset-button\")\n",
    "\n",
    "        # Button Actions\n",
    "        # The preview is listed twice on purpose: the handlers return its value and a visibility update separately.\n",
    "        pos_submit_button.click(\n",
    "            positive_sampler_interface,\n",
    "            inputs=[pos_use_default_checkbox, pos_csv_file_input, size_input, random_input, seed_input, full_input],\n",
    "            outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n",
    "        )\n",
    "\n",
    "        pos_reset_button.click(\n",
    "            reset_output,\n",
    "            inputs=[],\n",
    "            outputs=[pos_output_file, pos_dataset_preview, pos_dataset_preview]\n",
    "        )\n",
    "\n",
    "        # Toggle File Input\n",
    "        def toggle_pos_csv_input(use_default):\n",
    "            \"\"\"Show the upload field only while the default dataset is not selected.\"\"\"\n",
    "            return gr.update(visible=not use_default)\n",
    "\n",
    "        pos_use_default_checkbox.change(\n",
    "            toggle_pos_csv_input,\n",
    "            inputs=[pos_use_default_checkbox],\n",
    "            outputs=[pos_csv_file_input]\n",
    "        )\n",
    "\n",
    "    # Negative Tool Sampler Section\n",
    "    gr.Markdown(\"\"\"\n",
    "    <div class=\"title\">\n",
    "        <span class=\"negative\">Negative</span>\n",
    "        <span class=\"tool\">Tool</span>\n",
    "        <span class=\"sampler\">Sampler</span>\n",
    "    </div>\n",
    "    \"\"\")\n",
    "\n",
    "    gr.Markdown(\"\"\"\n",
    "    <p class=\"description\">\n",
    "    This tool generates datasets by creating negative samples from positive matches between preprints and articles.\n",
    "    Customize the difficulty and the augmentation factor to meet your needs.\n",
    "    </p>\n",
    "    \"\"\")\n",
    "\n",
    "    with gr.Group():\n",
    "        with gr.Row():\n",
    "            # precision=0 matches the \"(int)\" label: the handler receives an integer factor.\n",
    "            factor_input = gr.Number(\n",
    "                label=\"Factor (int)\", value=1, precision=0, info=\"Specify the number of negative samples per positive sample.\"\n",
    "            )\n",
    "            # A default selection keeps None from reaching the sampler when the user picks nothing.\n",
    "            type_dropdown = gr.Dropdown(\n",
    "                [\"random\", \"similar topics\", \"overlapping authors\", \"random authors\", \"fuzzed title\"],\n",
    "                value=\"random\",\n",
    "                label=\"Select Difficulty or Augmentation Type\"\n",
    "            )\n",
    "        with gr.Row():\n",
    "            use_default_checkbox = gr.Checkbox(label=\"Use Default Dataset\", value=True)\n",
    "            csv_file_input = gr.File(label=\"Upload CSV (optional)\", file_types=[\".csv\"], visible=False)\n",
    "\n",
    "        with gr.Group():\n",
    "            output_file = gr.File(label=\"Download Augmented Dataset\")\n",
    "            dataset_preview = gr.DataFrame(label=\"Dataset Preview\", interactive=False, visible=False)\n",
    "            with gr.Row():\n",
    "                submit_button = gr.Button(\"Submit 🚀\", elem_id=\"neg-submit-button\")\n",
    "                reset_button = gr.Button(\"Reset 🔄\", elem_id=\"neg-reset-button\")\n",
    "\n",
    "        # Button Actions\n",
    "        # The preview is listed twice on purpose: the handlers return its value and a visibility update separately.\n",
    "        submit_button.click(\n",
    "            augment_interface,\n",
    "            inputs=[factor_input, type_dropdown, use_default_checkbox, csv_file_input],\n",
    "            outputs=[output_file, dataset_preview, dataset_preview]\n",
    "        )\n",
    "\n",
    "        reset_button.click(\n",
    "            reset_output,\n",
    "            inputs=[],\n",
    "            outputs=[output_file, dataset_preview, dataset_preview]\n",
    "        )\n",
    "\n",
    "        # Toggle File Input\n",
    "        def toggle_csv_input(use_default):\n",
    "            \"\"\"Show the upload field only while the default dataset is not selected.\"\"\"\n",
    "            return gr.update(visible=not use_default)\n",
    "\n",
    "        use_default_checkbox.change(\n",
    "            toggle_csv_input,\n",
    "            inputs=[use_default_checkbox],\n",
    "            outputs=[csv_file_input]\n",
    "        )\n",
    "\n",
    "# Launch the app\n",
    "# share=True additionally requests a temporary public URL for the running app.\n",
    "if __name__ == \"__main__\":\n",
    "    app.launch(share=True)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "marple",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}