{ "cells": [ { "cell_type": "markdown", "id": "b3cb2ca7-3a4c-4547-a838-5e93c9215094", "metadata": {}, "source": [ "Notebook which adds necessary columns to \"anupam_original.csv\" - \"relative_id\", \"point_group\" and \"crystal_system\" and saves the result as template.csv" ] }, { "cell_type": "code", "execution_count": 1, "id": "55f97550-8a91-4b3b-b0a9-af3c099d7fb4", "metadata": {}, "outputs": [], "source": [ "import json\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import sys\n", "sys.path.append('..')\n", "from src.band_plotters import DATA_DIRECTORY" ] }, { "cell_type": "code", "execution_count": 2, "id": "a0eb5db1-79a3-4ecd-a57e-775353f0c819", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
formulagen_formulaspace_groupsegmentsflat_segmentsflatness_scorediscoverybinary_flatnesshorz_flat_segexfoliation_eg...ABCDEFradiof_orbsg_sto_grouppercentage_flat
ID
2dm-1IrF2AB2164300.095102bottom-up000.234620...FIrNaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN
2dm-2Ba2SbAB2164310.387410bottom-up000.210650...BaSbNaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN
2dm-3TlSAB2440.846460bottom-up130.095794...STlNaNNaNNaNNaNnon-radioactiveno-f-in-valence276.024.2
2dm-4MoCl2AB2166540.713760bottom-up00-0.055818...ClMoNaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN
2dm-6RuI2AB2164310.264930bottom-up000.084831...IRuNaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN
\n", "

5 rows × 24 columns

\n", "
" ], "text/plain": [ " formula gen_formula space_group segments flat_segments \\\n", "ID \n", "2dm-1 IrF2 AB2 164 3 0 \n", "2dm-2 Ba2Sb AB2 164 3 1 \n", "2dm-3 TlS AB 2 4 4 \n", "2dm-4 MoCl2 AB2 166 5 4 \n", "2dm-6 RuI2 AB2 164 3 1 \n", "\n", " flatness_score discovery binary_flatness horz_flat_seg \\\n", "ID \n", "2dm-1 0.095102 bottom-up 0 0 \n", "2dm-2 0.387410 bottom-up 0 0 \n", "2dm-3 0.846460 bottom-up 1 3 \n", "2dm-4 0.713760 bottom-up 0 0 \n", "2dm-6 0.264930 bottom-up 0 0 \n", "\n", " exfoliation_eg ... A B C D E F radio \\\n", "ID ... \n", "2dm-1 0.234620 ... F Ir NaN NaN NaN NaN non-radioactive \n", "2dm-2 0.210650 ... Ba Sb NaN NaN NaN NaN non-radioactive \n", "2dm-3 0.095794 ... S Tl NaN NaN NaN NaN non-radioactive \n", "2dm-4 -0.055818 ... Cl Mo NaN NaN NaN NaN non-radioactive \n", "2dm-6 0.084831 ... I Ru NaN NaN NaN NaN non-radioactive \n", "\n", " f_orb sg_sto_group percentage_flat \n", "ID \n", "2dm-1 no-f-in-valence NaN NaN \n", "2dm-2 no-f-in-valence NaN NaN \n", "2dm-3 no-f-in-valence 276.0 24.2 \n", "2dm-4 no-f-in-valence NaN NaN \n", "2dm-6 no-f-in-valence NaN NaN \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_anupam = pd.read_csv(\"../fingerprints/anupam_original.csv\", index_col=\"ID\")\n", "df_anupam.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "effeb8fd-4adc-4ace-af9b-750aae7693a6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['formula', 'gen_formula', 'space_group', 'segments', 'flat_segments',\n", " 'flatness_score', 'discovery', 'binary_flatness', 'horz_flat_seg',\n", " 'exfoliation_eg', 'decomp_eng', 'band_gap', 'point_group', 'crystal',\n", " 'A', 'B', 'C', 'D', 'E', 'F', 'radio', 'f_orb', 'sg_sto_group',\n", " 'percentage_flat'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_anupam.columns" ] }, { "cell_type": "code", "execution_count": 4, "id": "919fca45-3162-49d7-8d05-46b4fac59d92", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['_id', 'sg_number', 'formula_pretty', 'nelements', 'creation_task_label', 'created_at', 'chemsys', 'material_id', 'discovery_process', 'formula_reduced_abc', 'sg_symbol', 'formula_anonymous', 'source_id', 'spacegroup', 'elements', '_tasksbuilder', 'structure', 'bandstructure', 'bandgap', 'thermo', 'energy_vdw_per_atom', 'magnetism', 'total_magnetization', 'calc_settings', 'energy_per_atom', 'relative_id', 'decomposition_energy', 'exfoliation_energy_per_atom'])\n" ] } ], "source": [ "with open(DATA_DIRECTORY/'db.json') as f:\n", " print(json.loads(next(f)).keys())" ] }, { "cell_type": "code", "execution_count": 5, "id": "bf2baa9c-ef61-446b-8709-6c400f22da8a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(161, 154)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df_anupam.copy()\n", "relative_id = pd.Series(index=df.index, dtype=\"object\")\n", "crystal_system = pd.Series(index=df.index, dtype=\"object\")\n", "point_group = pd.Series(index=df.index, dtype=\"object\")\n", "\n", "\n", "no_discovery_process = []\n", "\n", "with open(DATA_DIRECTORY/'db.json') as f:\n", " for material in f:\n", " material=json.loads(material)\n", " mat_id = material[\"material_id\"]\n", " \n", " try:\n", " relative_id[mat_id] = material[\"relative_id\"]\n", " except:\n", " no_discovery_process.append(material[\"discovery_process\"])\n", "\n", " crystal_system[mat_id] = material[\"spacegroup\"][\"crystal_system\"]\n", " point_group[mat_id] = material[\"spacegroup\"][\"point_group\"]\n", " \n", "len(no_discovery_process), relative_id.isna().sum() # not equal because some materials in db.json aren't in Anupam's df" ] }, { "cell_type": "code", "execution_count": 7, "id": "d6b7a013-6e20-411f-a0d2-f4d69129b551", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
formulagen_formulaspace_groupsegmentsflat_segmentsflatness_scorediscoverybinary_flatnesshorz_flat_segexfoliation_eg...CDEFradiof_orbsg_sto_grouppercentage_flatrelative_idcrystal_system
ID
2dm-1IrF2AB2164300.095102bottom-up000.234620...NaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN2dm-4963trigonal
2dm-2Ba2SbAB2164310.387410bottom-up000.210650...NaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN2dm-3279trigonal
2dm-3TlSAB2440.846460bottom-up130.095794...NaNNaNNaNNaNnon-radioactiveno-f-in-valence276.024.22dm-5155triclinic
2dm-4MoCl2AB2166540.713760bottom-up00-0.055818...NaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN2dm-4342trigonal
2dm-6RuI2AB2164310.264930bottom-up000.084831...NaNNaNNaNNaNnon-radioactiveno-f-in-valenceNaNNaN2dm-3574trigonal
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " formula gen_formula space_group segments flat_segments \\\n", "ID \n", "2dm-1 IrF2 AB2 164 3 0 \n", "2dm-2 Ba2Sb AB2 164 3 1 \n", "2dm-3 TlS AB 2 4 4 \n", "2dm-4 MoCl2 AB2 166 5 4 \n", "2dm-6 RuI2 AB2 164 3 1 \n", "\n", " flatness_score discovery binary_flatness horz_flat_seg \\\n", "ID \n", "2dm-1 0.095102 bottom-up 0 0 \n", "2dm-2 0.387410 bottom-up 0 0 \n", "2dm-3 0.846460 bottom-up 1 3 \n", "2dm-4 0.713760 bottom-up 0 0 \n", "2dm-6 0.264930 bottom-up 0 0 \n", "\n", " exfoliation_eg ... C D E F radio \\\n", "ID ... \n", "2dm-1 0.234620 ... NaN NaN NaN NaN non-radioactive \n", "2dm-2 0.210650 ... NaN NaN NaN NaN non-radioactive \n", "2dm-3 0.095794 ... NaN NaN NaN NaN non-radioactive \n", "2dm-4 -0.055818 ... NaN NaN NaN NaN non-radioactive \n", "2dm-6 0.084831 ... NaN NaN NaN NaN non-radioactive \n", "\n", " f_orb sg_sto_group percentage_flat relative_id \\\n", "ID \n", "2dm-1 no-f-in-valence NaN NaN 2dm-4963 \n", "2dm-2 no-f-in-valence NaN NaN 2dm-3279 \n", "2dm-3 no-f-in-valence 276.0 24.2 2dm-5155 \n", "2dm-4 no-f-in-valence NaN NaN 2dm-4342 \n", "2dm-6 no-f-in-valence NaN NaN 2dm-3574 \n", "\n", " crystal_system \n", "ID \n", "2dm-1 trigonal \n", "2dm-2 trigonal \n", "2dm-3 triclinic \n", "2dm-4 trigonal \n", "2dm-6 trigonal \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"relative_id\"] = relative_id\n", "df[\"crystal_system\"] = crystal_system\n", "df[\"point_group\"] = point_group\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "bac85316-eb51-49ff-ab6f-196d678bf25e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array(['-1', '-3', '-3m', '-4', '-42m', '-6', '-6m2', '1', '2', '2/m',\n", " '222', '3', '32', '3m', '4', '4/m', '4/mmm', '422', '4mm', '6/mmm',\n", " '6mm', 'm', 'mm2', 'mmm'], dtype=object),\n", " array(['hexagonal', 'monoclinic', 'orthorhombic', 'tetragonal',\n", " 'triclinic', 'trigonal'], dtype=object))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.unique(df[\"point_group\"]), np.unique(df[\"crystal_system\"])" ] }, { "cell_type": "code", "execution_count": 9, "id": "07e335d6-1322-4117-a530-e900179cfeee", "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"../fingerprints/template.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 5 }