{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ISCO-08 hierarchical accuracy measure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ISCO CSV file downloaded\n",
      "Weighted ISCO hierarchy dictionary created as isco_hierarchy\n",
      "\n",
      "The ISCO-08 Hierarchical Accuracy Measure is an implementation of the measure described in [Functional Annotation of Genes Using Hierarchical Text Categorization](https://www.researchgate.net/publication/44046343_Functional_Annotation_of_Genes_Using_Hierarchical_Text_Categorization) (Kiritchenko, Svetlana and Famili, Fazel. 2005) and adapted for the ISCO-08 classification scheme by the International Labour Organization.\n",
      "\n",
      "The measure rewards more precise classifications that correctly identify an occupation's placement down to the specific Unit group level and applies penalties for misclassifications based on the hierarchical distance between the correct and assigned categories.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import evaluate\n",
    "\n",
    "# Load the metric by its Hub id instead of a machine-specific absolute path so\n",
    "# the notebook runs on any machine; the model-evaluation section loads the same\n",
    "# id. Pass the path of a local checkout here to try an unpublished version.\n",
    "ham = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")\n",
    "print(ham.description)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "References: ['1111', '1112', '1113', '1114', '1120']\n",
      "Predictions: ['1111', '1113', '1120', '1211', '2111']\n",
      "Accuracy: 0.2, Hierarchical Precision: 0.5, Hierarchical Recall: 0.7777777777777778, Hierarchical F-measure: 0.6086956521739131\n",
      "{'accuracy': 0.2, 'hierarchical_precision': 0.5, 'hierarchical_recall': 0.7777777777777778, 'hierarchical_fmeasure': 0.6086956521739131}\n"
     ]
    }
   ],
   "source": [
    "# Hand-made example: the pairs range from an exact match to codes that do not\n",
    "# even share their first digit.\n",
    "references = \"1111 1112 1113 1114 1120\".split()\n",
    "predictions = \"1111 1113 1120 1211 2111\".split()\n",
    "\n",
    "print(f\"References: {references}\")\n",
    "print(f\"Predictions: {predictions}\")\n",
    "\n",
    "example_scores = ham.compute(predictions=predictions, references=references)\n",
    "print(example_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TEST CASE #1\n",
      "References: ['1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111', '1111']\n",
      "Predictions: ['1111', '1112', '1120', '1211', '1311', '2111', '111', '11', '1', '9999']\n",
      "Accuracy: 0.1, Hierarchical Precision: 0.2222222222222222, Hierarchical Recall: 1.0, Hierarchical F-measure: 0.3636363636363636\n",
      "{'accuracy': 0.1, 'hierarchical_precision': 0.2222222222222222, 'hierarchical_recall': 1.0, 'hierarchical_fmeasure': 0.3636363636363636}\n",
      "\n",
      "TEST CASE #2\n",
      "References: ['1111']\n",
      "Predictions: ['1111']\n",
      "Accuracy: 1.0, Hierarchical Precision: 1.0, Hierarchical Recall: 1.0, Hierarchical F-measure: 1.0\n",
      "{'accuracy': 1.0, 'hierarchical_precision': 1.0, 'hierarchical_recall': 1.0, 'hierarchical_fmeasure': 1.0}\n",
      "\n",
      "TEST CASE #3\n",
      "References: ['1111']\n",
      "Predictions: ['1112']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.75, Hierarchical Recall: 0.75, Hierarchical F-measure: 0.75\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.75, 'hierarchical_recall': 0.75, 'hierarchical_fmeasure': 0.75}\n",
      "\n",
      "TEST CASE #4\n",
      "References: ['1111']\n",
      "Predictions: ['1120']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.5, Hierarchical Recall: 0.5, Hierarchical F-measure: 0.5\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.5, 'hierarchical_recall': 0.5, 'hierarchical_fmeasure': 0.5}\n",
      "\n",
      "TEST CASE #5\n",
      "References: ['1111']\n",
      "Predictions: ['1211']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.25, Hierarchical Recall: 0.25, Hierarchical F-measure: 0.25\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.25, 'hierarchical_recall': 0.25, 'hierarchical_fmeasure': 0.25}\n",
      "\n",
      "TEST CASE #6\n",
      "References: ['1111']\n",
      "Predictions: ['1311']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.25, Hierarchical Recall: 0.25, Hierarchical F-measure: 0.25\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.25, 'hierarchical_recall': 0.25, 'hierarchical_fmeasure': 0.25}\n",
      "\n",
      "TEST CASE #7\n",
      "References: ['1111']\n",
      "Predictions: ['2111']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.0, Hierarchical Recall: 0.0, Hierarchical F-measure: 0\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.0, 'hierarchical_recall': 0.0, 'hierarchical_fmeasure': 0}\n",
      "\n",
      "TEST CASE #8\n",
      "References: ['1111']\n",
      "Predictions: ['111']\n",
      "Accuracy: 0.0, Hierarchical Precision: 1.0, Hierarchical Recall: 0.25, Hierarchical F-measure: 0.4\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 1.0, 'hierarchical_recall': 0.25, 'hierarchical_fmeasure': 0.4}\n",
      "\n",
      "TEST CASE #9\n",
      "References: ['1111']\n",
      "Predictions: ['11']\n",
      "Accuracy: 0.0, Hierarchical Precision: 1.0, Hierarchical Recall: 0.25, Hierarchical F-measure: 0.4\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 1.0, 'hierarchical_recall': 0.25, 'hierarchical_fmeasure': 0.4}\n",
      "\n",
      "TEST CASE #10\n",
      "References: ['1111']\n",
      "Predictions: ['1']\n",
      "Accuracy: 0.0, Hierarchical Precision: 1.0, Hierarchical Recall: 0.25, Hierarchical F-measure: 0.4\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 1.0, 'hierarchical_recall': 0.25, 'hierarchical_fmeasure': 0.4}\n",
      "\n",
      "TEST CASE #11\n",
      "References: ['1111']\n",
      "Predictions: ['9999']\n",
      "Accuracy: 0.0, Hierarchical Precision: 0.0, Hierarchical Recall: 0.0, Hierarchical F-measure: 0\n",
      "{'accuracy': 0.0, 'hierarchical_precision': 0.0, 'hierarchical_recall': 0.0, 'hierarchical_fmeasure': 0}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score every case shipped in the local `tests` module and show each result.\n",
    "from tests import test_cases\n",
    "\n",
    "# Cases are numbered from 1 in the printed headers.\n",
    "for test_number, test_case in enumerate(test_cases, start=1):\n",
    "    references, predictions = test_case[\"references\"], test_case[\"predictions\"]\n",
    "\n",
    "    print(f\"TEST CASE #{test_number}\")\n",
    "    print(f\"References: {references}\")\n",
    "    print(f\"Predictions: {predictions}\")\n",
    "\n",
    "    case_scores = ham.compute(references=references, predictions=predictions)\n",
    "    print(case_scores)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model evaluation using the test split of the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import (\n",
    "    get_dataset_config_names,\n",
    "    get_dataset_infos,\n",
    "    get_dataset_split_names,\n",
    "    load_dataset,\n",
    ")\n",
    "\n",
    "# Load the \"ilo\" configuration (no split selected) and display what came back.\n",
    "dataset = load_dataset(path=\"ICILS/multilingual_parental_occupations\", name=\"ilo\")\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4634a4a344384ef28d182adeea1f5afc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading builder script:   0%|          | 0.00/13.4k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ISCO CSV file downloaded\n",
      "Weighted ISCO hierarchy dictionary created as isco_hierarchy\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import evaluate\n",
    "from datasets import load_dataset\n",
    "from transformers import pipeline\n",
    "\n",
    "# The dataset and the model are requested with an access token; stop early\n",
    "# with a clear message when it has not been exported.\n",
    "hf_token = os.getenv(\"HF_TOKEN\")\n",
    "if hf_token is None:\n",
    "    raise ValueError(\"HF_TOKEN environment variable is not set.\")\n",
    "\n",
    "# Test and validation splits of the \"icils\" configuration.\n",
    "test_split, validation_split = (\n",
    "    load_dataset(\"ICILS/multilingual_parental_occupations\", \"icils\", split=split_name, token=hf_token)\n",
    "    for split_name in (\"test\", \"validation\")\n",
    ")\n",
    "\n",
    "# Fixed-seed shuffle, then the first 100 rows: a small sample for a quick run.\n",
    "test_data_subset = test_split.shuffle(seed=42).select(range(100))\n",
    "\n",
    "# Text-classification pipeline for the model under evaluation\n",
    "# (alternative checkpoint: ICILS/XLM-R-ISCO).\n",
    "model = \"danieldux/XLM-R-ISCO-v2\"\n",
    "pipe = pipeline(task=\"text-classification\", model=model, token=hf_token)\n",
    "\n",
    "# Metric used by the evaluation cells that follow.\n",
    "hierarchical_accuracy = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['IDSTUD', 'JOB_DUTIES', 'ISCO', 'ISCO_REL', 'ISCO_TITLE', 'ISCO_CODE_TITLE', 'COUNTRY', 'LANGUAGE'],\n",
       "    num_rows: 100\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the 100-row sample drawn from the test split (features and row count).\n",
    "test_data_subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-03-31--01:29\n",
      "Evaluation results saved to test_split_results-2024-03-31--01:29.json\n"
     ]
    }
   ],
   "source": [
    "import datetime\n",
    "\n",
    "# Timestamp that keeps result files from separate runs apart. Only digits and\n",
    "# dashes are used because a colon in a file name is rejected on Windows.\n",
    "stamp = datetime.datetime.now().strftime(\"%Y-%m-%d--%H-%M\")\n",
    "print(stamp)\n",
    "\n",
    "# Classify each example's JOB_DUTIES text and pair the top predicted label with\n",
    "# the ISCO code stored in the dataset.\n",
    "predictions = []\n",
    "references = []\n",
    "for example in test_data_subset:\n",
    "    prediction = pipe(example[\"JOB_DUTIES\"])\n",
    "    predictions.append(prediction[0][\"label\"])\n",
    "    references.append(example[\"ISCO\"])\n",
    "\n",
    "# Compute the hierarchical accuracy\n",
    "test_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
    "\n",
    "# Save the results to a JSON file; the name is built once so the confirmation\n",
    "# message cannot drift from the file actually written.\n",
    "results_path = f\"test_split_results-{stamp}.json\"\n",
    "with open(results_path, \"w\") as f:\n",
    "    json.dump(test_results, f)\n",
    "\n",
    "print(f\"Evaluation results saved to {results_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'accuracy': 0.82,\n",
       " 'hierarchical_precision': 0.9090909090909091,\n",
       " 'hierarchical_recall': 0.8839779005524862,\n",
       " 'hierarchical_fmeasure': 0.8963585434173669}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the metric scores computed for the test-split sample.\n",
    "test_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.8523316062176166, Hierarchical Precision: 0.9711751662971175, Hierarchical Recall: 0.9733333333333334, Hierarchical F-measure: 0.9722530521642619\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_376175/1380879571.py:30: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  results_df = pd.concat(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.8549323017408124, Hierarchical Precision: 0.9425981873111783, Hierarchical Recall: 0.96, Hierarchical F-measure: 0.9512195121951218\n",
      "Accuracy: 0.817351598173516, Hierarchical Precision: 0.9076305220883534, Hierarchical Recall: 0.9377593360995851, Hierarchical F-measure: 0.9224489795918367\n",
      "Accuracy: 0.8160919540229885, Hierarchical Precision: 0.9140893470790378, Hierarchical Recall: 0.9204152249134948, Hierarchical F-measure: 0.9172413793103448\n",
      "Accuracy: 0.7801724137931034, Hierarchical Precision: 0.8776978417266187, Hierarchical Recall: 0.9207547169811321, Hierarchical F-measure: 0.8987108655616942\n",
      "Accuracy: 0.8200836820083682, Hierarchical Precision: 0.9007352941176471, Hierarchical Recall: 0.9176029962546817, Hierarchical F-measure: 0.9090909090909092\n",
      "Accuracy: 0.5149253731343284, Hierarchical Precision: 0.7487684729064039, Hierarchical Recall: 0.8, Hierarchical F-measure: 0.7735368956743003\n",
      "Accuracy: 0.9, Hierarchical Precision: 0.9244444444444444, Hierarchical Recall: 0.9285714285714286, Hierarchical F-measure: 0.9265033407572383\n",
      "Accuracy: 0.9030612244897959, Hierarchical Precision: 0.9509803921568627, Hierarchical Recall: 0.9603960396039604, Hierarchical F-measure: 0.9556650246305418\n",
      "Accuracy: 0.7836538461538461, Hierarchical Precision: 0.9047619047619048, Hierarchical Recall: 0.8916967509025271, Hierarchical F-measure: 0.8981818181818182\n",
      "Accuracy: 0.8707865168539326, Hierarchical Precision: 0.9269406392694064, Hierarchical Recall: 0.9441860465116279, Hierarchical F-measure: 0.9354838709677419\n",
      "Accuracy: 0.9230769230769231, Hierarchical Precision: 0.9, Hierarchical Recall: 0.9473684210526315, Hierarchical F-measure: 0.9230769230769231\n",
      "   Language  Accuracy  Hierarchical Precision  Hierarchical Recall  \\\n",
      "0        sv  0.923077                0.900000             0.947368   \n",
      "1        ko  0.870787                0.926941             0.944186   \n",
      "2        pt  0.783654                0.904762             0.891697   \n",
      "3        kk  0.903061                0.950980             0.960396   \n",
      "4        ru  0.900000                0.924444             0.928571   \n",
      "5        de  0.514925                0.748768             0.800000   \n",
      "6        fi  0.820084                0.900735             0.917603   \n",
      "7        da  0.780172                0.877698             0.920755   \n",
      "8        fr  0.816092                0.914089             0.920415   \n",
      "9        it  0.817352                0.907631             0.937759   \n",
      "10       es  0.854932                0.942598             0.960000   \n",
      "11       en  0.852332                0.971175             0.973333   \n",
      "\n",
      "    Hierarchical F1  \n",
      "0          0.923077  \n",
      "1          0.935484  \n",
      "2          0.898182  \n",
      "3          0.955665  \n",
      "4          0.926503  \n",
      "5          0.773537  \n",
      "6          0.909091  \n",
      "7          0.898711  \n",
      "8          0.917241  \n",
      "9          0.922449  \n",
      "10         0.951220  \n",
      "11         0.972253  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Per-language evaluation on the full test split (`test_split` is loaded in the\n",
    "# setup cell).\n",
    "test_data_df = test_split.to_pandas()\n",
    "\n",
    "language_rows = []\n",
    "for language in test_data_df[\"LANGUAGE\"].unique():\n",
    "    # Rows for the current language only. A dedicated name is used so the\n",
    "    # 100-row `test_data_subset` from the setup cell is left untouched.\n",
    "    language_df = test_data_df[test_data_df[\"LANGUAGE\"] == language]\n",
    "\n",
    "    predictions = []\n",
    "    references = []\n",
    "    for example in language_df.to_dict(\"records\"):\n",
    "        # The top pipeline label is used directly as the predicted ISCO code,\n",
    "        # the same way the test-set evaluation cell does it.\n",
    "        prediction = pipe(example[\"JOB_DUTIES\"])\n",
    "        predictions.append(prediction[0][\"label\"])\n",
    "        references.append(example[\"ISCO\"])\n",
    "\n",
    "    # Kept separate from `test_results`, which holds the test-sample scores.\n",
    "    language_results = hierarchical_accuracy.compute(\n",
    "        predictions=predictions, references=references\n",
    "    )\n",
    "\n",
    "    language_rows.append(\n",
    "        {\n",
    "            \"Language\": language,\n",
    "            \"Accuracy\": language_results[\"accuracy\"],\n",
    "            \"Hierarchical Precision\": language_results[\"hierarchical_precision\"],\n",
    "            \"Hierarchical Recall\": language_results[\"hierarchical_recall\"],\n",
    "            \"Hierarchical F1\": language_results[\"hierarchical_fmeasure\"],\n",
    "        }\n",
    "    )\n",
    "\n",
    "# Build the table once rather than concatenating inside the loop, which is\n",
    "# quadratic and raises a pandas FutureWarning when the accumulator is empty.\n",
    "# Rows are reversed so the most recently evaluated language comes first.\n",
    "results_df = pd.DataFrame(\n",
    "    language_rows[::-1],\n",
    "    columns=[\"Language\", \"Accuracy\", \"Hierarchical Precision\", \"Hierarchical Recall\", \"Hierarchical F1\"],\n",
    ")\n",
    "\n",
    "# Show the evaluation results\n",
    "results_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the per-language model scores to CSV, leaving out the index column.\n",
    "results_df.to_csv('model_language_results.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.8576800694243564, Hierarchical Precision: 0.9757462686567164, Hierarchical Recall: 0.9812382739212008, Hierarchical F-measure: 0.9784845650140319\n",
      "Evaluation results saved to isco_validation_results.json\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the model on the validation split loaded in the setup cell.\n",
    "predictions = []\n",
    "references = []\n",
    "for example in validation_split:\n",
    "    # The top pipeline label is used directly as the predicted ISCO code, the\n",
    "    # same way the test-set evaluation cell does it.\n",
    "    prediction = pipe(example[\"JOB_DUTIES\"])\n",
    "    predictions.append(prediction[0][\"label\"])\n",
    "    references.append(example[\"ISCO\"])\n",
    "\n",
    "# Compute the hierarchical accuracy\n",
    "validation_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
    "\n",
    "# Save the results to a JSON file\n",
    "with open(\"isco_validation_results.json\", \"w\") as f:\n",
    "    json.dump(validation_results, f)\n",
    "\n",
    "print(\"Evaluation results saved to isco_validation_results.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inter-rater agreement"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## All ICILS 2018 data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# ICILS 2018 coding data. A local copy can be read instead from\n",
    "# \"/datasets/isco-data/processed/2018/icils_2018_isco_ml.parquet\".\n",
    "icils_isco_int_ml = \"gs://isco-data-asia-southeast1/processed/2018/icils_2018_isco_ml.parquet\"\n",
    "\n",
    "columns_of_interest = ['JOB', 'DUTIES', 'ISCO', 'ISCO_REL', 'LANGUAGE']\n",
    "icils_df = pd.read_parquet(icils_isco_int_ml)[columns_of_interest]\n",
    "\n",
    "# Keep only the rows that carry both an ISCO and an ISCO_REL value.\n",
    "isco_rel_df = icils_df.dropna(subset=['ISCO', 'ISCO_REL'])\n",
    "\n",
    "# One group per LANGUAGE value, consumed by the per-language comparison.\n",
    "grouped_df = isco_rel_df.groupby('LANGUAGE')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### By language"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Agreement between the ISCO and ISCO_REL codings, scored separately for each\n",
    "# language, with ISCO as the reference and ISCO_REL as the prediction.\n",
    "metric_columns = ['Accuracy', 'Hierarchical Precision', 'Hierarchical Recall', 'Hierarchical F1']\n",
    "\n",
    "language_rows = []\n",
    "for language, group in grouped_df:\n",
    "    references = group['ISCO'].tolist()\n",
    "    predictions = group['ISCO_REL'].tolist()\n",
    "\n",
    "    rel_result = hierarchical_accuracy.compute(references=references, predictions=predictions)\n",
    "\n",
    "    language_rows.append(\n",
    "        {\n",
    "            'Language': language,\n",
    "            'Accuracy': rel_result['accuracy'],\n",
    "            'Hierarchical Precision': rel_result['hierarchical_precision'],\n",
    "            'Hierarchical Recall': rel_result['hierarchical_recall'],\n",
    "            'Hierarchical F1': rel_result['hierarchical_fmeasure'],\n",
    "        }\n",
    "    )\n",
    "\n",
    "    print(f\"Language: {language}\")\n",
    "    print(f\"Result: {rel_result}\")\n",
    "    print()\n",
    "\n",
    "# Build the table once instead of concatenating onto an empty frame inside the\n",
    "# loop, which is quadratic and relies on deprecated pandas concat behaviour.\n",
    "results_df = pd.DataFrame(language_rows, columns=['Language', *metric_columns])\n",
    "\n",
    "# Closing row: the unweighted mean of each metric across languages, so every\n",
    "# language counts equally regardless of how many samples it has.\n",
    "average_row = ['Average'] + [results_df[column].mean() for column in metric_columns]\n",
    "results_df.loc[len(results_df)] = average_row\n",
    "\n",
    "results_df.to_csv('inter-rater_language_results.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Works on the test split loaded in the setup cell.\n",
    "test_data_df = test_split.to_pandas()\n",
    "\n",
    "# A row has unknown reliability when ISCO_REL is missing or is \"9998\"/\"9999\".\n",
    "# The mask is computed once and reused for both selections so they cannot\n",
    "# drift apart.\n",
    "unknown_reliability = test_data_df['ISCO_REL'].isna() | test_data_df['ISCO_REL'].isin([\"9998\", \"9999\"])\n",
    "unknown_reliability_samples = test_data_df[unknown_reliability]\n",
    "\n",
    "# Exclude unknown reliability samples from test_data_df\n",
    "test_split_rel_df = test_data_df[~unknown_reliability]\n",
    "\n",
    "# Group the DataFrame by LANGUAGE column\n",
    "test_split_rel_grouped_df = test_split_rel_df.groupby('LANGUAGE')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validation data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows on which ISCO and ISCO_REL hold the same value.\n",
    "codes_match = isco_rel_df['ISCO'].eq(isco_rel_df['ISCO_REL'])\n",
    "isco_rel_df_same = isco_rel_df.loc[codes_match]\n",
    "\n",
    "isco_rel_df_same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows on which ISCO and ISCO_REL hold different values.\n",
    "codes_differ = isco_rel_df['ISCO'].ne(isco_rel_df['ISCO_REL'])\n",
    "isco_rel_df_diff = isco_rel_df.loc[codes_differ]\n",
    "\n",
    "isco_rel_df_diff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both codings as plain lists: coder1 from ISCO, coder2 from ISCO_REL.\n",
    "coder1, coder2 = (list(isco_rel_df[column]) for column in ('ISCO', 'ISCO_REL'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the second coding against the first, with coder1 as the reference.\n",
    "reliability_results = hierarchical_accuracy.compute(references=coder1, predictions=coder2)\n",
    "\n",
    "# Serialise the scores and write them to disk.\n",
    "with open(\"isco_rel_results.json\", \"w\") as results_file:\n",
    "    results_file.write(json.dumps(reliability_results))\n",
    "\n",
    "print(\"Evaluation results saved to isco_rel_results.json\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "autogenstudio",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}