{
 "cells": [
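  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Data Stats\n",
    "\n",
    "Counts of related and unrelated pairs (`is_related`) in the `all_pairs` configuration of the `JetBrains-Research/synthetic-commit-msg-edits` dataset: overall, and per `G_type`/`E_type` subset. For each subset, the total row count and the mean row count per (`hash`, `repo`) group are reported."
   ],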
   "id": "694a6cc631d4ab93"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:43:07.644299Z",
     "start_time": "2024-10-15T18:43:02.316453Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "\n",
    "# Load the train split of the \"all_pairs\" configuration and convert it to a pandas DataFrame.\n",
    "df = load_dataset(\n",
    "    \"JetBrains-Research/synthetic-commit-msg-edits\",\n",
    "    \"all_pairs\",\n",
    "    split=\"train\",\n",
    ").to_pandas()\n",
    "\n",
    "# Preview the first rows.\n",
    "df.head()"
   ],
   "id": "ed42f4f83199feb2",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 6.35M/6.35M [00:00<00:00, 9.95MB/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "1a0523289d424b29974b60d017643280"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "                                       hash              repo  \\\n",
       "0  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson   \n",
       "1  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson   \n",
       "2  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson   \n",
       "3  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson   \n",
       "4  2febb99eee8ed71c9122db88ca58dd33be0b9550  mesonbuild/meson   \n",
       "\n",
       "                                              G_text  \\\n",
       "0  Enhance OptionOverrideProxy and simplify optio...   \n",
       "1  Enhance OptionOverrideProxy and simplify optio...   \n",
       "2  Enhance OptionOverrideProxy and simplify optio...   \n",
       "3  Enhance OptionOverrideProxy and simplify optio...   \n",
       "4  Enhance OptionOverrideProxy and simplify optio...   \n",
       "\n",
       "                                              E_text              G_type  \\\n",
       "0  Enhance OptionOverrideProxy for multiple optio...  synthetic_backward   \n",
       "1  Refactor OptionOverrideProxy and Backend class...  synthetic_backward   \n",
       "2  Refactor OptionOverrideProxy and backend optio...  synthetic_backward   \n",
       "3  Refactor: Enhance OptionOverrideProxy for mult...  synthetic_backward   \n",
       "4  Refactor OptionOverrideProxy and add target-sp...  synthetic_backward   \n",
       "\n",
       "                            E_type  is_related  \n",
       "0                   expert_labeled        True  \n",
       "1                synthetic_forward        True  \n",
       "2                synthetic_forward        True  \n",
       "3                synthetic_forward        True  \n",
       "4  synthetic_forward_from_backward       False  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hash</th>\n",
       "      <th>repo</th>\n",
       "      <th>G_text</th>\n",
       "      <th>E_text</th>\n",
       "      <th>G_type</th>\n",
       "      <th>E_type</th>\n",
       "      <th>is_related</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
       "      <td>mesonbuild/meson</td>\n",
       "      <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
       "      <td>Enhance OptionOverrideProxy for multiple optio...</td>\n",
       "      <td>synthetic_backward</td>\n",
       "      <td>expert_labeled</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
       "      <td>mesonbuild/meson</td>\n",
       "      <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
       "      <td>Refactor OptionOverrideProxy and Backend class...</td>\n",
       "      <td>synthetic_backward</td>\n",
       "      <td>synthetic_forward</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
       "      <td>mesonbuild/meson</td>\n",
       "      <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
       "      <td>Refactor OptionOverrideProxy and backend optio...</td>\n",
       "      <td>synthetic_backward</td>\n",
       "      <td>synthetic_forward</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
       "      <td>mesonbuild/meson</td>\n",
       "      <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
       "      <td>Refactor: Enhance OptionOverrideProxy for mult...</td>\n",
       "      <td>synthetic_backward</td>\n",
       "      <td>synthetic_forward</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2febb99eee8ed71c9122db88ca58dd33be0b9550</td>\n",
       "      <td>mesonbuild/meson</td>\n",
       "      <td>Enhance OptionOverrideProxy and simplify optio...</td>\n",
       "      <td>Refactor OptionOverrideProxy and add target-sp...</td>\n",
       "      <td>synthetic_backward</td>\n",
       "      <td>synthetic_forward_from_backward</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Full dataset",
   "id": "922e7a73f11a4aec"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:43:14.266540Z",
     "start_time": "2024-10-15T18:43:14.262103Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows in the whole frame where is_related is True.\n",
    "len(df[df.is_related])"
   ],
   "id": "562d9c53da109d1a",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "656"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:43:18.073966Z",
     "start_time": "2024-10-15T18:43:18.069219Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among related rows.\n",
    "(\n",
    "    df[df.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "b4f3c96a4b676a0d",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43.733333333333334"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:43:19.026689Z",
     "start_time": "2024-10-15T18:43:19.021680Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows in the whole frame where is_related is False.\n",
    "len(df[~df.is_related])"
   ],
   "id": "54d9f32f1d18844f",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5140"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:43:19.484304Z",
     "start_time": "2024-10-15T18:43:19.480012Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among unrelated rows.\n",
    "(\n",
    "    df[~df.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "679761631517b9e4",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "342.6666666666667"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Expert-labeled",
   "id": "84561ea89717d61a"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:45:52.905631Z",
     "start_time": "2024-10-15T18:45:52.901913Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Subset used by the following cells: G_type is \"initial\" and E_type is \"expert_labeled\".\n",
    "_ = df[df.G_type.eq(\"initial\") & df.E_type.eq(\"expert_labeled\")]"
   ],
   "id": "be1c800f45cef26e",
   "outputs": [],
   "execution_count": 36
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:45:53.234109Z",
     "start_time": "2024-10-15T18:45:53.230986Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is True.\n",
    "len(_[_.is_related])"
   ],
   "id": "1d092dff4d39bcd1",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 37
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:45:53.629311Z",
     "start_time": "2024-10-15T18:45:53.625620Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among related rows of `_`.\n",
    "(\n",
    "    _[_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "a06a532cd5a29725",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.8"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 38
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:45:53.956790Z",
     "start_time": "2024-10-15T18:45:53.953842Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is False.\n",
    "len(_[~_.is_related])"
   ],
   "id": "5e19c8a6309b62aa",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 39
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:02.554527Z",
     "start_time": "2024-10-15T18:46:02.551084Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among unrelated rows of `_`.\n",
    "# When the selection is empty (as in the stored output) the mean is NaN.\n",
    "(\n",
    "    _[~_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "e43179c5dcab5eb2",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Backward",
   "id": "70ee052fae2f88e3"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:44:33.559606Z",
     "start_time": "2024-10-15T18:44:33.556802Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Subset used by the following cells: G_type is \"synthetic_backward\" and E_type is\n",
    "# neither \"synthetic_forward\" nor \"synthetic_forward_from_backward\".\n",
    "_ = df[\n",
    "    df.G_type.eq(\"synthetic_backward\")\n",
    "    & ~df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"])\n",
    "]"
   ],
   "id": "99f51ecc55c4db35",
   "outputs": [],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:44:33.958325Z",
     "start_time": "2024-10-15T18:44:33.955847Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is True.\n",
    "len(_[_.is_related])"
   ],
   "id": "6ff29390c8e127c2",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "104"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 21
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:44:34.455560Z",
     "start_time": "2024-10-15T18:44:34.452303Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among related rows of `_`.\n",
    "(\n",
    "    _[_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "e1ae04e1ecfb2040",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.428571428571429"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:44:34.903849Z",
     "start_time": "2024-10-15T18:44:34.901226Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is False.\n",
    "len(_[~_.is_related])"
   ],
   "id": "125c4c335e7761da",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1048"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 23
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:44:35.783538Z",
     "start_time": "2024-10-15T18:44:35.778676Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among unrelated rows of `_`.\n",
    "(\n",
    "    _[~_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "4782f1d6e6863f89",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "74.85714285714286"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 24
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Forward",
   "id": "bf61a4b422f779fa"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### From human",
   "id": "1429f9f99acf75d"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:21.359807Z",
     "start_time": "2024-10-15T18:46:21.356451Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Subset used by the following cells: G_type is \"initial\" and E_type is \"synthetic_forward\".\n",
    "_ = df[df.G_type.eq(\"initial\") & df.E_type.eq(\"synthetic_forward\")]"
   ],
   "id": "e13d55b0124f04b3",
   "outputs": [],
   "execution_count": 41
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:21.798508Z",
     "start_time": "2024-10-15T18:46:21.795885Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is True.\n",
    "len(_[_.is_related])"
   ],
   "id": "b8353390df7da427",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "177"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 42
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:22.163595Z",
     "start_time": "2024-10-15T18:46:22.160176Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among related rows of `_`.\n",
    "(\n",
    "    _[_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "ac89afde65efd73d",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11.8"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 43
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:22.552314Z",
     "start_time": "2024-10-15T18:46:22.549570Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is False.\n",
    "len(_[~_.is_related])"
   ],
   "id": "9b6cb335e3bbb7ff",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 44
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:46:23.237736Z",
     "start_time": "2024-10-15T18:46:23.234085Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among unrelated rows of `_`.\n",
    "# Reads `_` (the subset assigned at the start of this section), not IPython's `__` output\n",
    "# cache, so the cell does not depend on kernel history and survives Restart & Run All.\n",
    "# When the selection is empty (as in the stored output) the mean is NaN.\n",
    "_.loc[~_.is_related].groupby([\"hash\", \"repo\"]).G_text.count().mean()"
   ],
   "id": "fe22189a70fc4149",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 45
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### From backward",
   "id": "ace7afb876fb88a0"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:47:06.641374Z",
     "start_time": "2024-10-15T18:47:06.637018Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Subset used by the following cells: G_type is \"synthetic_backward\" and E_type is\n",
    "# either \"synthetic_forward\" or \"synthetic_forward_from_backward\".\n",
    "_ = df[\n",
    "    df.G_type.eq(\"synthetic_backward\")\n",
    "    & df.E_type.isin([\"synthetic_forward\", \"synthetic_forward_from_backward\"])\n",
    "]"
   ],
   "id": "88800960dbff619a",
   "outputs": [],
   "execution_count": 53
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:47:15.358650Z",
     "start_time": "2024-10-15T18:47:15.355108Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is True.\n",
    "len(_[_.is_related])"
   ],
   "id": "890613156e005c83",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "318"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 56
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:47:15.579415Z",
     "start_time": "2024-10-15T18:47:15.576016Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among related rows of `_`.\n",
    "(\n",
    "    _[_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "999f91382a2c8ff6",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22.714285714285715"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 57
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:47:15.834218Z",
     "start_time": "2024-10-15T18:47:15.831258Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Number of rows of the current subset `_` where is_related is False.\n",
    "len(_[~_.is_related])"
   ],
   "id": "d347941cbb4b2db1",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3753"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 58
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-15T18:47:16.138798Z",
     "start_time": "2024-10-15T18:47:16.133397Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Mean count of non-null G_text values per (hash, repo) group among unrelated rows of `_`.\n",
    "(\n",
    "    _[~_.is_related]\n",
    "    .groupby([\"hash\", \"repo\"])[\"G_text\"]\n",
    "    .count()\n",
    "    .mean()\n",
    ")"
   ],
   "id": "2db4d96713a8634d",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "268.07142857142856"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 59
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}