{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "D4Yal6HOyyBt" }, "source": [ "# Importing Libraries & loading data" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "id": "Sesct3fTzQVW" }, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder\n", "import xgboost as xgb\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", "import numpy as np\n", "from sklearn.metrics import mean_squared_error\n", "from xgboost import XGBRegressor\n", "import calendar\n", "import pickle" ] }, { "cell_type": "markdown", "metadata": { "id": "h5D2aP28yyBw" }, "source": [ "# Checking the data for null values" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "id": "WGqXaM-XzQVX" }, "outputs": [], "source": [ "parent_df = pd.read_csv(\"/content/monatszahlen2412_verkehrsunfaelle_06_12_24.csv\")\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "id": "zBBpsXqszQVY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a61d9595-6efe-4111-9192-529e54dd7fe3" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 2254 entries, 0 to 2253\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 MONATSZAHL 2254 non-null object \n", " 1 AUSPRAEGUNG 2254 non-null object \n", " 2 JAHR 2254 non-null int64 \n", " 3 MONAT 2254 non-null object \n", " 4 WERT 2086 non-null float64\n", " 5 VORJAHRESWERT 2086 non-null float64\n", " 6 VERAEND_VORMONAT_PROZENT 1924 non-null float64\n", " 7 VERAEND_VORJAHRESMONAT_PROZENT 2001 non-null float64\n", " 8 ZWOELF_MONATE_MITTELWERT 1932 non-null float64\n", "dtypes: float64(5), int64(1), object(3)\n", "memory usage: 158.6+ KB\n", "None\n", "MONATSZAHL 0\n", "AUSPRAEGUNG 0\n", "JAHR 0\n", "MONAT 0\n", "WERT 168\n", "VORJAHRESWERT 168\n", "VERAEND_VORMONAT_PROZENT 330\n", "VERAEND_VORJAHRESMONAT_PROZENT 253\n", "ZWOELF_MONATE_MITTELWERT 322\n", "dtype: int64\n" ] } ], "source": [ "print(parent_df.info())\n", "print(parent_df.isna().sum())\n" ] }, { "cell_type": "markdown", "source": [ "# Only using the 5 columns since only they are deemed important in the instructions\n", "\n", "Important are the first 5 columns:\n", "Category\n", "Accident-type (insgesamt means total for all subcategories)\n", "Year\n", "Month\n", "Value\n" ], "metadata": { "id": "3beiNb3QAS_M" } }, { "cell_type": "code", "source": [ "parent_df = parent_df[['MONATSZAHL', 'AUSPRAEGUNG', 'JAHR', 'MONAT', 'WERT']]\n", "\n", "print(f\"Unqiue values of MONATSZAHL: {parent_df['MONATSZAHL'].unique()}\")\n", "print(f\"Unqiue values of AUSPRAEGUNG: {parent_df['AUSPRAEGUNG'].unique()}\")\n", "print(f\"Unqiue valus of JAHR: {parent_df['JAHR'].unique()}\")\n", "print(f\"Unqiue valus of MONAT: {parent_df['MONAT'].unique()}\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xTZE4w5qASTn", "outputId": "60696fe6-ee69-4ca6-b947-5d491b91a1b7" }, "execution_count": 87, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Unqiue values of MONATSZAHL: ['Alkoholunfälle' 'Fluchtunfälle' 'Verkehrsunfälle']\n", "Unqiue values of AUSPRAEGUNG: ['insgesamt' 'Verletzte und Getötete' 'mit Personenschäden']\n", "Unqiue valus of JAHR: [2024 2023 2022 2021 2020 2019 2018 2017 2016 2015 2014 2013 2012 2011\n", " 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000]\n", "Unqiue valus of MONAT: ['202401' '202402' '202403' '202404' '202405' '202406' '202407' '202408'\n", " '202409' '202410' '202411' '202412' '202301' '202302' '202303' '202304'\n", " '202305' '202306' '202307' '202308' '202309' '202310' '202311' '202312'\n", " 'Summe' '202201' '202202' '202203' '202204' '202205' '202206' '202207'\n", " '202208' '202209' '202210' '202211' '202212' '202101' '202102' '202103'\n", " '202104' '202105' '202106' '202107' '202108' '202109' '202110' '202111'\n", " '202112' '202001' '202002' '202003' '202004' '202005' '202006' '202007'\n", " '202008' '202009' '202010' '202011' '202012' '201901' '201902' '201903'\n", " '201904' '201905' '201906' '201907' '201908' '201909' '201910' '201911'\n", " '201912' '201801' '201802' '201803' '201804' '201805' '201806' '201807'\n", " '201808' '201809' '201810' '201811' '201812' '201701' '201702' '201703'\n", " '201704' '201705' '201706' '201707' '201708' '201709' '201710' '201711'\n", " '201712' '201601' '201602' '201603' '201604' '201605' '201606' '201607'\n", " '201608' '201609' '201610' '201611' '201612' '201501' '201502' '201503'\n", " '201504' '201505' '201506' '201507' '201508' '201509' '201510' '201511'\n", " '201512' '201401' '201402' '201403' '201404' '201405' '201406' '201407'\n", " '201408' '201409' '201410' '201411' '201412' '201301' '201302' '201303'\n", " '201304' '201305' '201306' '201307' '201308' '201309' '201310' '201311'\n", " '201312' '201201' '201202' '201203' '201204' '201205' '201206' '201207'\n", " '201208' '201209' '201210' '201211' '201212' '201101' '201102' '201103'\n", " '201104' '201105' '201106' '201107' '201108' '201109' '201110' '201111'\n", " '201112' '201001' '201002' '201003' '201004' '201005' '201006' '201007'\n", " '201008' '201009' '201010' '201011' '201012' '200901' '200902' '200903'\n", " '200904' '200905' '200906' '200907' '200908' '200909' '200910' '200911'\n", " '200912' '200801' '200802' '200803' '200804' '200805' '200806' '200807'\n", " '200808' '200809' '200810' '200811' '200812' '200701' '200702' '200703'\n", " '200704' '200705' '200706' '200707' '200708' '200709' '200710' '200711'\n", " '200712' '200601' '200602' '200603' '200604' '200605' '200606' '200607'\n", " '200608' '200609' '200610' '200611' '200612' '200501' '200502' '200503'\n", " '200504' '200505' '200506' '200507' '200508' '200509' '200510' '200511'\n", " '200512' '200401' '200402' '200403' '200404' '200405' '200406' '200407'\n", " '200408' '200409' '200410' '200411' '200412' '200301' '200302' '200303'\n", " '200304' '200305' '200306' '200307' '200308' '200309' '200310' '200311'\n", " '200312' '200201' '200202' '200203' '200204' '200205' '200206' '200207'\n", " '200208' '200209' '200210' '200211' '200212' '200101' '200102' '200103'\n", " '200104' '200105' '200106' '200107' '200108' '200109' '200110' '200111'\n", " '200112' '200001' '200002' '200003' '200004' '200005' '200006' '200007'\n", " '200008' '200009' '200010' '200011' '200012']\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Dropping the rows where the year value is after 2020" ], "metadata": { "id": "-6-k9bkqAeCn" } }, { "cell_type": "code", "execution_count": 88, "metadata": { "id": "r5SG6rjMzQVY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9a45ba3d-234c-49ca-e722-bb97db5348d3" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Unqiue values of JAHR: [2019 2018 2017 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006\n", " 2005 2004 2003 2002 2001 2000]\n" ] } ], "source": [ "parent_df = parent_df[parent_df['JAHR']<2020]\n", "\n", "print(f\"Unqiue values of JAHR: {parent_df['JAHR'].unique()}\")" ] }, { "cell_type": "code", "source": [ "parent_df.head(2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 112 }, "id": "jWNecV_uAkkq", "outputId": "501a5e7a-6f21-4fc2-8137-d562e0b6bd5d" }, "execution_count": 89, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MONATSZAHL AUSPRAEGUNG JAHR MONAT WERT\n", "63 Alkoholunfälle insgesamt 2019 Summe 434.0\n", "64 Alkoholunfälle insgesamt 2019 201901 22.0" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONATSZAHLAUSPRAEGUNGJAHRMONATWERT
63Alkoholunfälleinsgesamt2019Summe434.0
64Alkoholunfälleinsgesamt201920190122.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "parent_df", "summary": "{\n \"name\": \"parent_df\",\n \"rows\": 1813,\n \"fields\": [\n {\n \"column\": \"MONATSZAHL\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Alkoholunf\\u00e4lle\",\n \"Fluchtunf\\u00e4lle\",\n \"Verkehrsunf\\u00e4lle\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AUSPRAEGUNG\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"insgesamt\",\n \"Verletzte und Get\\u00f6tete\",\n \"mit Personensch\\u00e4den\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"JAHR\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 2000,\n \"max\": 2019,\n \"num_unique_values\": 20,\n \"samples\": [\n 2019,\n 2002,\n 2004\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MONAT\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 241,\n \"samples\": [\n \"201812\",\n \"201906\",\n \"200106\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WERT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4565.788307713133,\n \"min\": 0.0,\n \"max\": 46988.0,\n \"num_unique_values\": 903,\n \"samples\": [\n 74.0,\n 4074.0,\n 951.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 89 } ] }, { "cell_type": "code", "source": [ "parent_df.reset_index(drop=True, inplace=True)\n", "\n", "parent_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "vYRSRdsCAmRY", "outputId": "7810682a-213e-4df6-e32a-15a3fecc1b67" }, "execution_count": 90, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MONATSZAHL AUSPRAEGUNG JAHR MONAT WERT\n", "0 Alkoholunfälle insgesamt 2019 Summe 434.0\n", "1 Alkoholunfälle insgesamt 2019 201901 22.0\n", "2 Alkoholunfälle insgesamt 2019 201902 28.0\n", "3 Alkoholunfälle insgesamt 2019 201903 34.0\n", "4 Alkoholunfälle insgesamt 2019 201904 36.0\n", "... ... ... ... ... ...\n", "1808 Verkehrsunfälle Verletzte und Getötete 2000 200008 647.0\n", "1809 Verkehrsunfälle Verletzte und Getötete 2000 200009 675.0\n", "1810 Verkehrsunfälle Verletzte und Getötete 2000 200010 615.0\n", "1811 Verkehrsunfälle Verletzte und Getötete 2000 200011 578.0\n", "1812 Verkehrsunfälle Verletzte und Getötete 2000 200012 515.0\n", "\n", "[1813 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONATSZAHLAUSPRAEGUNGJAHRMONATWERT
0Alkoholunfälleinsgesamt2019Summe434.0
1Alkoholunfälleinsgesamt201920190122.0
2Alkoholunfälleinsgesamt201920190228.0
3Alkoholunfälleinsgesamt201920190334.0
4Alkoholunfälleinsgesamt201920190436.0
..................
1808VerkehrsunfälleVerletzte und Getötete2000200008647.0
1809VerkehrsunfälleVerletzte und Getötete2000200009675.0
1810VerkehrsunfälleVerletzte und Getötete2000200010615.0
1811VerkehrsunfälleVerletzte und Getötete2000200011578.0
1812VerkehrsunfälleVerletzte und Getötete2000200012515.0
\n", "

1813 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "parent_df", "summary": "{\n \"name\": \"parent_df\",\n \"rows\": 1813,\n \"fields\": [\n {\n \"column\": \"MONATSZAHL\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Alkoholunf\\u00e4lle\",\n \"Fluchtunf\\u00e4lle\",\n \"Verkehrsunf\\u00e4lle\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AUSPRAEGUNG\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"insgesamt\",\n \"Verletzte und Get\\u00f6tete\",\n \"mit Personensch\\u00e4den\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"JAHR\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 2000,\n \"max\": 2019,\n \"num_unique_values\": 20,\n \"samples\": [\n 2019,\n 2002,\n 2004\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MONAT\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 241,\n \"samples\": [\n \"201812\",\n \"201906\",\n \"200106\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WERT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4565.788307713133,\n \"min\": 0.0,\n \"max\": 46988.0,\n \"num_unique_values\": 903,\n \"samples\": [\n 74.0,\n 4074.0,\n 951.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 90 } ] }, { "cell_type": "markdown", "metadata": { "id": "mixK8LmRyyBx" }, "source": [ "# Since its a regression task, it always helps to remove outliers from the dataset. it will just exclude values that dont lie near the rest of the data points, making the distribution even more better" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "id": "jJVSlcgGzQVZ" }, "outputs": [], "source": [ "columns = parent_df.select_dtypes(include=[np.number]).columns\n", "\n", "df = parent_df.copy()\n", "\n", "for col in columns:\n", " z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())\n", " df = df[z_scores < 3]" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hYPXBmXfzQVa", "outputId": "7686e12f-d8eb-48c2-91ae-2060e2d0bef0" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Summe', '201901', '201902', '201903', '201904', '201905',\n", " '201906', '201907', '201908', '201909', '201910', '201911',\n", " '201912', '201801', '201802', '201803', '201804', '201805',\n", " '201806', '201807', '201808', '201809', '201810', '201811',\n", " '201812', '201701', '201702', '201703', '201704', '201705',\n", " '201706', '201707', '201708', '201709', '201710', '201711',\n", " '201712', '201601', '201602', '201603', '201604', '201605',\n", " '201606', '201607', '201608', '201609', '201610', '201611',\n", " '201612', '201501', '201502', '201503', '201504', '201505',\n", " '201506', '201507', '201508', '201509', '201510', '201511',\n", " '201512', '201401', '201402', '201403', '201404', '201405',\n", " '201406', '201407', '201408', '201409', '201410', '201411',\n", " '201412', '201301', '201302', '201303', '201304', '201305',\n", " '201306', '201307', '201308', '201309', '201310', '201311',\n", " '201312', '201201', '201202', '201203', '201204', '201205',\n", " '201206', '201207', '201208', '201209', '201210', '201211',\n", " '201212', '201101', '201102', '201103', '201104', '201105',\n", " '201106', '201107', '201108', '201109', '201110', '201111',\n", " '201112', '201001', '201002', '201003', '201004', '201005',\n", " '201006', '201007', '201008', '201009', '201010', '201011',\n", " '201012', '200901', '200902', '200903', '200904', '200905',\n", " '200906', '200907', '200908', '200909', '200910', '200911',\n", " '200912', '200801', '200802', '200803', '200804', '200805',\n", " '200806', '200807', '200808', '200809', '200810', '200811',\n", " '200812', '200701', '200702', '200703', '200704', '200705',\n", " '200706', '200707', '200708', '200709', '200710', '200711',\n", " '200712', '200601', '200602', '200603', '200604', '200605',\n", " '200606', '200607', '200608', '200609', '200610', '200611',\n", " '200612', '200501', '200502', '200503', '200504', '200505',\n", " '200506', '200507', '200508', '200509', '200510', '200511',\n", " '200512', '200401', '200402', '200403', '200404', '200405',\n", " '200406', '200407', '200408', '200409', '200410', '200411',\n", " '200412', '200301', '200302', '200303', '200304', '200305',\n", " '200306', '200307', '200308', '200309', '200310', '200311',\n", " '200312', '200201', '200202', '200203', '200204', '200205',\n", " '200206', '200207', '200208', '200209', '200210', '200211',\n", " '200212', '200101', '200102', '200103', '200104', '200105',\n", " '200106', '200107', '200108', '200109', '200110', '200111',\n", " '200112', '200001', '200002', '200003', '200004', '200005',\n", " '200006', '200007', '200008', '200009', '200010', '200011',\n", " '200012'], dtype=object)" ] }, "metadata": {}, "execution_count": 92 } ], "source": [ "df['MONAT'].unique()" ] }, { "cell_type": "markdown", "source": [ "# Observing how values i.e 'WERT' is distributed based on different columns" ], "metadata": { "id": "nN0BPCJfDrz-" } }, { "cell_type": "code", "source": [ "df['MONATSZAHL'].hist(bins=30, alpha=0.7)\n", "plt.title('distribution of accident categories')\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 452 }, "id": "myFIxEpkDtT5", "outputId": "aa5259a4-20b1-48b4-d9a0-75b26c930d45" }, "execution_count": 93, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "df['AUSPRAEGUNG'].hist(bins=30, alpha=0.7)\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 430 }, "id": "qI8yKHJgDw6p", "outputId": "c364e179-1c6e-49fe-cb1c-3a7e4addc341" }, "execution_count": 94, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "# i observed that values are distributed almost uniformly for col JAHR but for column MONAT its a different case" ], "metadata": { "id": "KDJVmnwkD3RV" } }, { "cell_type": "code", "source": [ "df['JAHR'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 743 }, "id": "vbmMkR5pD2i6", "outputId": "65e87d35-7952-44aa-80e2-67568eecb6cf" }, "execution_count": 95, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "JAHR\n", "2019 90\n", "2018 90\n", "2001 90\n", "2002 90\n", "2003 90\n", "2004 90\n", "2005 90\n", "2006 90\n", "2007 90\n", "2008 90\n", "2009 90\n", "2010 90\n", "2011 90\n", "2012 90\n", "2013 90\n", "2014 90\n", "2015 90\n", "2016 90\n", "2017 90\n", "2000 84\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
JAHR
201990
201890
200190
200290
200390
200490
200590
200690
200790
200890
200990
201090
201190
201290
201390
201490
201590
201690
201790
200084
\n", "

" ] }, "metadata": {}, "execution_count": 95 } ] }, { "cell_type": "code", "source": [ "df['MONAT'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 490 }, "id": "cJyIFAQ1D6iU", "outputId": "c443171e-8f63-4b30-e756-b773706c23fb" }, "execution_count": 96, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MONAT\n", "Summe 114\n", "200901 7\n", "200709 7\n", "200710 7\n", "200711 7\n", " ... \n", "201201 7\n", "201202 7\n", "201203 7\n", "201204 7\n", "200012 7\n", "Name: count, Length: 241, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
MONAT
Summe114
2009017
2007097
2007107
2007117
......
2012017
2012027
2012037
2012047
2000127
\n", "

241 rows × 1 columns

\n", "

" ] }, "metadata": {}, "execution_count": 96 } ] }, { "cell_type": "markdown", "source": [ "# Since MONAT includes values like '200703' i.e contains year+month values. We can probly remove the year since we already have that feature in 'JAHR' column. Ill just slice to extract the month values" ], "metadata": { "id": "1kjnie9KDgJw" } }, { "cell_type": "code", "source": [ "def convert_date(data, column_name='MONAT', special_value='Summe'):\n", "\n", " day_mapping = {\n", " '01': 'January',\n", " '02': 'February',\n", " '03': 'March',\n", " '04': 'April',\n", " '05': 'May',\n", " '06': 'June',\n", " '07': 'July',\n", " '08': 'August',\n", " '09': 'September',\n", " '10': 'October',\n", " '11': 'November',\n", " '12': 'December'\n", " }\n", "\n", " data_copy = data.copy()\n", " data_copy[column_name] = data_copy[column_name].apply(lambda x: day_mapping[x[4:]] if x != special_value else x)\n", "\n", " return data_copy\n", "\n", "df = convert_date(df, 'MONAT')" ], "metadata": { "id": "Jvv6y70NzDYB" }, "execution_count": 97, "outputs": [] }, { "cell_type": "code", "source": [ "df['MONAT'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1zsJV0d5D96Y", "outputId": "c63ae632-16de-4eb3-ee2d-2a1c7ac4ecf6" }, "execution_count": 98, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Summe', 'January', 'February', 'March', 'April', 'May', 'June',\n", " 'July', 'August', 'September', 'October', 'November', 'December'],\n", " dtype=object)" ] }, "metadata": {}, "execution_count": 98 } ] }, { "cell_type": "code", "source": [ "df['MONAT'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 523 }, "id": "EEWdkVDQBFG4", "outputId": "5c965b3c-fa8e-4e60-df43-dab476a49a11" }, "execution_count": 99, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MONAT\n", "January 140\n", "February 140\n", "March 140\n", "April 140\n", "May 140\n", "June 140\n", "July 140\n", "August 140\n", "September 140\n", "October 140\n", "November 140\n", "December 140\n", "Summe 114\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
MONAT
January140
February140
March140
April140
May140
June140
July140
August140
September140
October140
November140
December140
Summe114
\n", "

" ] }, "metadata": {}, "execution_count": 99 } ] }, { "cell_type": "markdown", "source": [ "# The distribution now seems better for MONAT column" ], "metadata": { "id": "PMO4-WmDEBUx" } }, { "cell_type": "code", "source": [ "plt.figure(figsize=(8,6))\n", "plt.hist(df['MONAT'], bins=13, edgecolor='black')\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 522 }, "id": "cg7JP1E-EC0M", "outputId": "463de13f-cedb-4718-d20d-fb2d49fb8535" }, "execution_count": 100, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "df.head(5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "3dqIKmKhEFoF", "outputId": "899a11a4-b688-4f51-b006-2ed00d10b51d" }, "execution_count": 101, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MONATSZAHL AUSPRAEGUNG JAHR MONAT WERT\n", "0 Alkoholunfälle insgesamt 2019 Summe 434.0\n", "1 Alkoholunfälle insgesamt 2019 January 22.0\n", "2 Alkoholunfälle insgesamt 2019 February 28.0\n", "3 Alkoholunfälle insgesamt 2019 March 34.0\n", "4 Alkoholunfälle insgesamt 2019 April 36.0" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONATSZAHLAUSPRAEGUNGJAHRMONATWERT
0Alkoholunfälleinsgesamt2019Summe434.0
1Alkoholunfälleinsgesamt2019January22.0
2Alkoholunfälleinsgesamt2019February28.0
3Alkoholunfälleinsgesamt2019March34.0
4Alkoholunfälleinsgesamt2019April36.0
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 1794,\n \"fields\": [\n {\n \"column\": \"MONATSZAHL\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Alkoholunf\\u00e4lle\",\n \"Fluchtunf\\u00e4lle\",\n \"Verkehrsunf\\u00e4lle\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AUSPRAEGUNG\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"insgesamt\",\n \"Verletzte und Get\\u00f6tete\",\n \"mit Personensch\\u00e4den\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"JAHR\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 2000,\n \"max\": 2019,\n \"num_unique_values\": 20,\n \"samples\": [\n 2019,\n 2002,\n 2004\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MONAT\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 13,\n \"samples\": [\n \"November\",\n \"September\",\n \"Summe\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WERT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1674.8731821587605,\n \"min\": 0.0,\n \"max\": 11773.0,\n \"num_unique_values\": 884,\n \"samples\": [\n 56.0,\n 4209.0,\n 284.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 101 } ] }, { "cell_type": "markdown", "metadata": { "id": "dNRwBaKlyyBz" }, "source": [ "# I was confused between using label encoder or one hot encoder, but since the model performed better for one hot encoder, i decided to just go with it" ] }, { "cell_type": "code", "source": [ "columns_to_encode = df.columns[0:4]" ], "metadata": { "id": "tjvcKt7JcjUm" }, "execution_count": 102, "outputs": [] }, { "cell_type": "code", "source": [ "columns_to_encode" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "P7ctV8D1dIOe", "outputId": "86ace85a-f8d0-4ea2-d82f-16a00d2152cb" }, "execution_count": 105, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['MONATSZAHL', 'AUSPRAEGUNG', 'JAHR', 'MONAT'], dtype='object')" ] }, "metadata": {}, "execution_count": 105 } ] }, { "cell_type": "code", "source": [ "def _one_hot(df):\n", " encoder = OneHotEncoder(sparse_output=False)\n", "\n", " one_hot_columns = df.columns[0:4]\n", "\n", " df_copy = df.copy()\n", "\n", " encoded_columns = encoder.fit_transform(df_copy[one_hot_columns])\n", "\n", " encoded_column_names = encoder.get_feature_names_out(one_hot_columns)\n", "\n", " encoded_df = pd.DataFrame(\n", " encoded_columns,\n", " columns=encoded_column_names,\n", " index=df_copy.index\n", " )\n", "\n", "\n", " final_df = pd.concat([\n", " df_copy.drop(columns=one_hot_columns),\n", " encoded_df\n", " ], axis=1)\n", "\n", " return final_df, encoder\n", "\n", "final_df, encoder = _one_hot(df)\n" ], "metadata": { "id": "h3rWO-QWY-Tn" }, "execution_count": 114, "outputs": [] }, { "cell_type": "code", "execution_count": 110, "metadata": { "id": "poFSghx08_ig", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "99da1f84-2ea1-4ff7-d458-0c77009c8c98" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "WERT 0\n", "MONATSZAHL_Alkoholunfälle 0\n", "MONATSZAHL_Fluchtunfälle 0\n", "MONATSZAHL_Verkehrsunfälle 0\n", "AUSPRAEGUNG_Verletzte und Getötete 0\n", "AUSPRAEGUNG_insgesamt 0\n", "AUSPRAEGUNG_mit Personenschäden 0\n", "JAHR_2000 0\n", "JAHR_2001 0\n", "JAHR_2002 0\n", "JAHR_2003 0\n", "JAHR_2004 0\n", "JAHR_2005 0\n", "JAHR_2006 0\n", "JAHR_2007 0\n", "JAHR_2008 0\n", "JAHR_2009 0\n", "JAHR_2010 0\n", "JAHR_2011 0\n", "JAHR_2012 0\n", "JAHR_2013 0\n", "JAHR_2014 0\n", "JAHR_2015 0\n", "JAHR_2016 0\n", "JAHR_2017 0\n", "JAHR_2018 0\n", "JAHR_2019 0\n", "MONAT_April 0\n", "MONAT_August 0\n", "MONAT_December 0\n", "MONAT_February 0\n", "MONAT_January 0\n", "MONAT_July 0\n", "MONAT_June 0\n", "MONAT_March 0\n", "MONAT_May 0\n", "MONAT_November 0\n", "MONAT_October 0\n", "MONAT_September 0\n", "MONAT_Summe 0\n", "dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
WERT0
MONATSZAHL_Alkoholunfälle0
MONATSZAHL_Fluchtunfälle0
MONATSZAHL_Verkehrsunfälle0
AUSPRAEGUNG_Verletzte und Getötete0
AUSPRAEGUNG_insgesamt0
AUSPRAEGUNG_mit Personenschäden0
JAHR_20000
JAHR_20010
JAHR_20020
JAHR_20030
JAHR_20040
JAHR_20050
JAHR_20060
JAHR_20070
JAHR_20080
JAHR_20090
JAHR_20100
JAHR_20110
JAHR_20120
JAHR_20130
JAHR_20140
JAHR_20150
JAHR_20160
JAHR_20170
JAHR_20180
JAHR_20190
MONAT_April0
MONAT_August0
MONAT_December0
MONAT_February0
MONAT_January0
MONAT_July0
MONAT_June0
MONAT_March0
MONAT_May0
MONAT_November0
MONAT_October0
MONAT_September0
MONAT_Summe0
\n", "

" ] }, "metadata": {}, "execution_count": 110 } ], "source": [ "final_df.isna().sum()" ] }, { "cell_type": "markdown", "metadata": { "id": "d2i8SDsSyyBz" }, "source": [ "# 3. finally training the model and downloading it as pkl to use in api" ] }, { "cell_type": "code", "source": [ "x = final_df.drop(columns=['WERT'])\n", "y = final_df['WERT']\n", "\n", "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)\n" ], "metadata": { "id": "H5GezDyXB_In" }, "execution_count": 77, "outputs": [] }, { "cell_type": "code", "execution_count": 78, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 309 }, "id": "EVn2xhzhzQVa", "outputId": "5fbe46ae-20a7-439d-8f15-e396494dda1a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Fitting 2 folds for each of 243 candidates, totalling 486 fits\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:56:52] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n", "\n", " E.g. tree_method = \"hist\", device = \"cuda\"\n", "\n", " warnings.warn(smsg, UserWarning)\n", "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:56:52] WARNING: /workspace/src/learner.cc:740: \n", "Parameters: { \"predictor\" } are not used.\n", "\n", " warnings.warn(smsg, UserWarning)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GridSearchCV(cv=2,\n", " estimator=XGBRegressor(base_score=None, booster=None,\n", " callbacks=None, colsample_bylevel=None,\n", " colsample_bynode=None,\n", " colsample_bytree=None, device=None,\n", " early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None,\n", " feature_types=None, gamma=None,\n", " grow_policy=None, importance_type=None,\n", " interaction_constraints=None,\n", " learning_rate=None, m...\n", " min_child_weight=None, missing=nan,\n", " monotone_constraints=None,\n", " multi_strategy=None, n_estimators=None,\n", " n_jobs=None, num_parallel_tree=None,\n", " predictor='gpu_predictor', ...),\n", " n_jobs=-1,\n", " param_grid={'colsample_bytree': [0.6, 0.7, 0.8],\n", " 'learning_rate': [0.01, 0.05, 0.1],\n", " 'max_depth': [3, 5, 7],\n", " 'n_estimators': [100, 200, 300],\n", " 'subsample': [0.7, 0.8, 0.9]},\n", " scoring='neg_mean_squared_error', verbose=2)" ], "text/html": [ "
GridSearchCV(cv=2,\n",
              "             estimator=XGBRegressor(base_score=None, booster=None,\n",
              "                                    callbacks=None, colsample_bylevel=None,\n",
              "                                    colsample_bynode=None,\n",
              "                                    colsample_bytree=None, device=None,\n",
              "                                    early_stopping_rounds=None,\n",
              "                                    enable_categorical=False, eval_metric=None,\n",
              "                                    feature_types=None, gamma=None,\n",
              "                                    grow_policy=None, importance_type=None,\n",
              "                                    interaction_constraints=None,\n",
              "                                    learning_rate=None, m...\n",
              "                                    min_child_weight=None, missing=nan,\n",
              "                                    monotone_constraints=None,\n",
              "                                    multi_strategy=None, n_estimators=None,\n",
              "                                    n_jobs=None, num_parallel_tree=None,\n",
              "                                    predictor='gpu_predictor', ...),\n",
              "             n_jobs=-1,\n",
              "             param_grid={'colsample_bytree': [0.6, 0.7, 0.8],\n",
              "                         'learning_rate': [0.01, 0.05, 0.1],\n",
              "                         'max_depth': [3, 5, 7],\n",
              "                         'n_estimators': [100, 200, 300],\n",
              "                         'subsample': [0.7, 0.8, 0.9]},\n",
              "             scoring='neg_mean_squared_error', verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 78 } ], "source": [ "xgb = XGBRegressor(\n", " tree_method='gpu_hist',\n", " predictor='gpu_predictor',\n", " verbosity=2\n", ")\n", "\n", "\n", "params = {\n", " 'n_estimators': [100, 200, 300],\n", " 'learning_rate': [0.01, 0.05, 0.1],\n", " 'max_depth': [3, 5, 7],\n", " 'subsample': [0.7, 0.8, 0.9],\n", " 'colsample_bytree': [0.6, 0.7, 0.8]\n", "}\n", "\n", "\n", "grid_search = GridSearchCV(\n", " estimator=xgb,\n", " param_grid=params,\n", " cv=2,\n", " scoring='neg_mean_squared_error',\n", " verbose=2,\n", " n_jobs=-1\n", ")\n", "\n", "grid_search.fit(x_train, y_train)" ] }, { "cell_type": "code", "source": [ "best_model = grid_search.best_estimator_\n", "y_pred = best_model.predict(x_test)\n", "\n", "mse = mean_squared_error(y_test, y_pred)\n", "print(\"Mean Squared Error on the test set: \", mse)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "G1aPfPxKIzYY", "outputId": "a8a91b18-7570-47d7-a483-881ddf7dd1d6" }, "execution_count": 79, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mean Squared Error on the test set: 17201.635761885165\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:56:53] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.\n", "\n", " E.g. tree_method = \"hist\", device = \"cuda\"\n", "\n", " warnings.warn(smsg, UserWarning)\n", "/usr/local/lib/python3.10/dist-packages/xgboost/core.py:158: UserWarning: [15:56:53] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", "Potential solutions:\n", "- Use a data structure that matches the device ordinal in the booster.\n", "- Set the device for booster before call to inplace_predict.\n", "\n", "This warning will only be shown once.\n", "\n", " warnings.warn(smsg, UserWarning)\n" ] } ] }, { "cell_type": "code", "source": [ "columns_to_encode" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h23QhE7Gd3E4", "outputId": "12854042-465a-41ee-b49f-84e3375baab8" }, "execution_count": 112, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['MONATSZAHL', 'AUSPRAEGUNG', 'JAHR', 'MONAT'], dtype='object')" ] }, "metadata": {}, "execution_count": 112 } ] }, { "cell_type": "code", "source": [ "def _inference(MONATSZAHL, AUSPRAEGUNG, JAHR, MONAT, encoder, columns_to_encode):\n", "\n", " temp_df = pd.DataFrame({\n", " 'MONATSZAHL': [MONATSZAHL],\n", " 'AUSPRAEGUNG': [AUSPRAEGUNG],\n", " 'JAHR': [JAHR],\n", " 'MONAT': [MONAT]\n", " })\n", "\n", " temp_df_copy = temp_df.copy()\n", "\n", " processed_df = convert_date(temp_df_copy)\n", "\n", " encoded_columns = encoder.transform(processed_df[columns_to_encode])\n", "\n", " encoded_column_names = encoder.get_feature_names_out(columns_to_encode)\n", "\n", " encoded_df = pd.DataFrame(encoded_columns, columns=encoded_column_names, index=processed_df.index)\n", "\n", " final_df = pd.concat([processed_df.drop(columns=columns_to_encode),encoded_df], axis=1)\n", "\n", " predictions = best_model.predict(final_df)\n", "\n", " return predictions\n", "\n", "result = _inference('Alkoholunfälle', 'Verletzte und Getötete', 2012, '201207', encoder, columns_to_encode)\n" ], "metadata": { "id": "ICDOC1WIX8As" }, "execution_count": 116, "outputs": [] }, { "cell_type": "code", "source": [ "result" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "u2ROrrC6fvo2", "outputId": "05bb5bb0-018a-4bb8-e06a-a53e383a9d58" }, "execution_count": 119, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([33.349915], dtype=float32)" ] }, "metadata": {}, "execution_count": 119 } ] }, { "cell_type": "markdown", "source": [ "# Storing the model & the encoder to be used in our api endpoint" ], "metadata": { "id": "zOa3pLT3a08b" } }, { "cell_type": "code", "execution_count": 118, "metadata": { "id": "EhSjmlIAOAGk" }, "outputs": [], "source": [ "with open('model.pkl', 'wb') as file:\n", " pickle.dump(best_model, file)\n", "\n", "with open('encoder.pkl', 'wb') as file:\n", " pickle.dump(encoder, file)" ] }, { "cell_type": "code", "source": [], "metadata": { "id": "JJsb2Rhve6CL" }, "execution_count": null, "outputs": [] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 0 }