{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "4e64d318", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "\n", " license tags task_categories last_modified \\\n", "0 None None None 2024-01-30 07:40:02+00:00 \n", "1 openrail None None 2022-11-11 14:12:36+00:00 \n", "2 cc None None 2022-07-15 21:11:34+00:00 \n", "3 None None None None \n", "4 None None None None \n", "\n", " reason \\\n", "0 No metadata and no description \n", "1 Short description (char count=0, words=0) \n", "2 Short description (char count=0, words=0) \n", "3 Failed to load card \n", "4 Failed to load card \n", "\n", " readme_path word_count category \n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n", "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n", "3 None 0 minimal \n", "4 None 0 minimal \n", " dataset_id \\\n", "0 autoevaluate/autoeval-staging-eval-launch__gov... \n", "1 autoevaluate/autoeval-eval-emotion-default-fe1... \n", "2 LTCB/enwik8 \n", "3 boltuix/emotions-dataset \n", "4 yixuantt/MultiHopRAG \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/autoevaluate/a... 8 None \n", "1 https://huggingface.co/datasets/autoevaluate/a... 
8 None \n", "2 https://huggingface.co/datasets/LTCB/enwik8 154 None \n", "3 https://huggingface.co/datasets/boltuix/emotio... 754 None \n", "4 https://huggingface.co/datasets/yixuantt/Multi... 7050 None \n", "\n", " license tags \\\n", "0 None autotrain, evaluation \n", "1 None autotrain, evaluation \n", "2 ['mit'] None \n", "3 mit emotions, nlp, sentiment-analysis, emotion-cla... \n", "4 odc-by None \n", "\n", " task_categories last_modified reason \\\n", "0 None 2022-09-09 07:44:04+00:00 None \n", "1 None 2022-09-16 20:22:59+00:00 None \n", "2 fill-mask, text-generation 2024-01-18 11:19:13+00:00 None \n", "3 None 2025-05-25 15:41:59+00:00 None \n", "4 question-answering, feature-extraction 2024-01-30 02:49:29+00:00 None \n", "\n", " readme_path word_count category \n", "0 dataset_readmes/autoevaluate__autoeval-staging... 55 rich \n", "1 dataset_readmes/autoevaluate__autoeval-eval-em... 57 rich \n", "2 dataset_readmes/LTCB__enwik8_README.md 427 rich \n", "3 dataset_readmes/boltuix__emotions-dataset_READ... 1643 rich \n", "4 dataset_readmes/yixuantt__MultiHopRAG_README.md 111 rich \n" ] } ], "source": [ "import os\n", "from pathlib import Path\n", "\n", "import pandas as pd\n", "\n", "# Directory holding the input files. The default is the original local checkout;\n", "# set DATASET_INSIGHT_DATA_DIR to run the notebook from another location.\n", "DATA_DIR = Path(\n", "    os.environ.get(\n", "        \"DATASET_INSIGHT_DATA_DIR\",\n", "        \"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal\",\n", "    )\n", ")\n", "\n", "# Read parquet files (df1: minimal dataset cards, df2: rich dataset cards)\n", "df1 = pd.read_parquet(DATA_DIR / \"all_minimal_dataset_cards.parquet\")\n", "df2 = pd.read_parquet(DATA_DIR / \"all_rich_dataset_cards.parquet\")\n", "\n", "# Display first few rows\n", "print(df1.head())\n", "print(df2.head())" ] }, { "cell_type": "code", "execution_count": 2, "id": "e9a20931", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataset_iddataset_urldownloadsauthorlicensetagstask_categorieslast_modifiedreasonreadme_pathword_countcategory
0akjadhav/leandojo-lean4-formal-informal-stringshttps://huggingface.co/datasets/akjadhav/leand...22NoneNoneNoneNone2024-01-30 07:40:02+00:00No metadata and no descriptiondataset_readmes/akjadhav__leandojo-lean4-forma...0minimal
1aemska/stuhlhttps://huggingface.co/datasets/aemska/stuhl11NoneopenrailNoneNone2022-11-11 14:12:36+00:00Short description (char count=0, words=0)dataset_readmes/aemska__stuhl_README.md0minimal
2Pogpotatofarmer/memeshttps://huggingface.co/datasets/Pogpotatofarme...15NoneccNoneNone2022-07-15 21:11:34+00:00Short description (char count=0, words=0)dataset_readmes/Pogpotatofarmer__memes_README.md0minimal
3Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100hhttps://huggingface.co/datasets/Splend1dchan/N...11NoneNoneNoneNoneNoneFailed to load cardNone0minimal
4chamisfum/brain_tumor_3_classeshttps://huggingface.co/datasets/chamisfum/brai...8NoneNoneNoneNoneNoneFailed to load cardNone0minimal
.......................................
400292TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...https://huggingface.co/datasets/TAUR-dev/D-EVA...0NoneNoneNoneNone2025-09-19 06:27:52+00:00Short description (char count=0, words=0)dataset_readmes/TAUR-dev__D-EVAL__standard_eva...0minimal
400293TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo...https://huggingface.co/datasets/TAUR-dev/D-EVA...0NoneNoneNoneNone2025-09-19 06:28:16+00:00Short description (char count=0, words=0)dataset_readmes/TAUR-dev__D-EVAL__standard_eva...0minimal
400294haru101/Minecraft-Knowledge-Datasethttps://huggingface.co/datasets/haru101/Minecr...0Noneapache-2.0Nonequestion-answering2025-09-19 06:33:33+00:00Short description (char count=0, words=0)dataset_readmes/haru101__Minecraft-Knowledge-D...0minimal
400295sxj1215/mmimdb_sorted_with_label_2https://huggingface.co/datasets/sxj1215/mmimdb...0NoneNoneNoneNone2025-09-19 06:35:25+00:00Short description (char count=0, words=0)dataset_readmes/sxj1215__mmimdb_sorted_with_la...0minimal
400296Vikir2411CS19/Multimodal_Complainthttps://huggingface.co/datasets/Vikir2411CS19/...0NoneNoneNoneNone2025-09-19 06:35:01+00:00Short description (char count=0, words=0)dataset_readmes/Vikir2411CS19__Multimodal_Comp...0minimal
\n", "

400297 rows × 12 columns

\n", "
" ], "text/plain": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "... ... \n", "400292 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n", "400293 TAUR-dev/D-EVAL__standard_eval_v3__RC_BF_ab-bo... \n", "400294 haru101/Minecraft-Knowledge-Dataset \n", "400295 sxj1215/mmimdb_sorted_with_label_2 \n", "400296 Vikir2411CS19/Multimodal_Complaint \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "... ... ... ... \n", "400292 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n", "400293 https://huggingface.co/datasets/TAUR-dev/D-EVA... 0 None \n", "400294 https://huggingface.co/datasets/haru101/Minecr... 0 None \n", "400295 https://huggingface.co/datasets/sxj1215/mmimdb... 0 None \n", "400296 https://huggingface.co/datasets/Vikir2411CS19/... 0 None \n", "\n", " license tags task_categories last_modified \\\n", "0 None None None 2024-01-30 07:40:02+00:00 \n", "1 openrail None None 2022-11-11 14:12:36+00:00 \n", "2 cc None None 2022-07-15 21:11:34+00:00 \n", "3 None None None None \n", "4 None None None None \n", "... ... ... ... ... 
\n", "400292 None None None 2025-09-19 06:27:52+00:00 \n", "400293 None None None 2025-09-19 06:28:16+00:00 \n", "400294 apache-2.0 None question-answering 2025-09-19 06:33:33+00:00 \n", "400295 None None None 2025-09-19 06:35:25+00:00 \n", "400296 None None None 2025-09-19 06:35:01+00:00 \n", "\n", " reason \\\n", "0 No metadata and no description \n", "1 Short description (char count=0, words=0) \n", "2 Short description (char count=0, words=0) \n", "3 Failed to load card \n", "4 Failed to load card \n", "... ... \n", "400292 Short description (char count=0, words=0) \n", "400293 Short description (char count=0, words=0) \n", "400294 Short description (char count=0, words=0) \n", "400295 Short description (char count=0, words=0) \n", "400296 Short description (char count=0, words=0) \n", "\n", " readme_path word_count category \n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n", "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n", "3 None 0 minimal \n", "4 None 0 minimal \n", "... ... ... ... \n", "400292 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n", "400293 dataset_readmes/TAUR-dev__D-EVAL__standard_eva... 0 minimal \n", "400294 dataset_readmes/haru101__Minecraft-Knowledge-D... 0 minimal \n", "400295 dataset_readmes/sxj1215__mmimdb_sorted_with_la... 0 minimal \n", "400296 dataset_readmes/Vikir2411CS19__Multimodal_Comp... 0 minimal \n", "\n", "[400297 rows x 12 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 4, "id": "b5582c36", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idurlfieldkeywordmissing_readmemissing_card
0solomonk/reddit_mental_health_postshttps://huggingface.co/datasets/solomonk/reddi...life_scienceshealthFalseTrue
1Kira-Asimov/gender_clinical_trialhttps://huggingface.co/datasets/Kira-Asimov/ge...life_sciencesclinicalFalseTrue
2samhog/psychology-6khttps://huggingface.co/datasets/samhog/psychol...life_sciencespsychologyTrueTrue
3TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do...https://huggingface.co/datasets/TCMLM/real_cli...life_sciencesclinicalFalseTrue
4jibrand/plant-dataset-JSONLhttps://huggingface.co/datasets/jibrand/plant-...agriculture_and_biologyplantTrueTrue
.....................
4035AshwinManohar/medicine_normalizer_alpacahttps://huggingface.co/datasets/AshwinManohar/...life_sciencesmedicineTrueTrue
4036AshwinManohar/medicine_parser_alpacahttps://huggingface.co/datasets/AshwinManohar/...life_sciencesmedicineTrueTrue
4037AshwinManohar/medicine_normalizer_alpaca_20khttps://huggingface.co/datasets/AshwinManohar/...life_sciencesmedicineTrueTrue
4038Adithyaaaa/plant_leaf_classificationhttps://huggingface.co/datasets/Adithyaaaa/pla...agriculture_and_biologyplantTrueTrue
4039benali-ai-24/drug-data-publichttps://huggingface.co/datasets/benali-ai-24/d...life_sciencesdrugTrueTrue
\n", "

4040 rows × 6 columns

\n", "
" ], "text/plain": [ " id \\\n", "0 solomonk/reddit_mental_health_posts \n", "1 Kira-Asimov/gender_clinical_trial \n", "2 samhog/psychology-6k \n", "3 TCMLM/real_clinical_cases_of_Famous_Old_TCM_Do... \n", "4 jibrand/plant-dataset-JSONL \n", "... ... \n", "4035 AshwinManohar/medicine_normalizer_alpaca \n", "4036 AshwinManohar/medicine_parser_alpaca \n", "4037 AshwinManohar/medicine_normalizer_alpaca_20k \n", "4038 Adithyaaaa/plant_leaf_classification \n", "4039 benali-ai-24/drug-data-public \n", "\n", " url \\\n", "0 https://huggingface.co/datasets/solomonk/reddi... \n", "1 https://huggingface.co/datasets/Kira-Asimov/ge... \n", "2 https://huggingface.co/datasets/samhog/psychol... \n", "3 https://huggingface.co/datasets/TCMLM/real_cli... \n", "4 https://huggingface.co/datasets/jibrand/plant-... \n", "... ... \n", "4035 https://huggingface.co/datasets/AshwinManohar/... \n", "4036 https://huggingface.co/datasets/AshwinManohar/... \n", "4037 https://huggingface.co/datasets/AshwinManohar/... \n", "4038 https://huggingface.co/datasets/Adithyaaaa/pla... \n", "4039 https://huggingface.co/datasets/benali-ai-24/d... \n", "\n", " field keyword missing_readme missing_card \n", "0 life_sciences health False True \n", "1 life_sciences clinical False True \n", "2 life_sciences psychology True True \n", "3 life_sciences clinical False True \n", "4 agriculture_and_biology plant True True \n", "... ... ... ... ... 
\n", "4035 life_sciences medicine True True \n", "4036 life_sciences medicine True True \n", "4037 life_sciences medicine True True \n", "4038 agriculture_and_biology plant True True \n", "4039 life_sciences drug True True \n", "\n", "[4040 rows x 6 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the CSV that carries a field/keyword label plus missing_readme and\n", "# missing_card flags for each dataset id.\n", "csv_df = pd.read_csv(\n", "    \"/home/santosh/Repositories/personal/huggingface/dataset-insight-portal/ds_missing_sci_data_4k.csv\"\n", ")\n", "csv_df" ] }, { "cell_type": "code", "execution_count": 6, "id": "a061659a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataset_iddataset_urldownloadsauthorlicensetagstask_categorieslast_modifiedreasonreadme_pathword_countcategory
0akjadhav/leandojo-lean4-formal-informal-stringshttps://huggingface.co/datasets/akjadhav/leand...22NoneNoneNoneNone2024-01-30 07:40:02+00:00No metadata and no descriptiondataset_readmes/akjadhav__leandojo-lean4-forma...0minimal
1aemska/stuhlhttps://huggingface.co/datasets/aemska/stuhl11NoneopenrailNoneNone2022-11-11 14:12:36+00:00Short description (char count=0, words=0)dataset_readmes/aemska__stuhl_README.md0minimal
2Pogpotatofarmer/memeshttps://huggingface.co/datasets/Pogpotatofarme...15NoneccNoneNone2022-07-15 21:11:34+00:00Short description (char count=0, words=0)dataset_readmes/Pogpotatofarmer__memes_README.md0minimal
3Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100hhttps://huggingface.co/datasets/Splend1dchan/N...11NoneNoneNoneNoneNoneFailed to load cardNone0minimal
4chamisfum/brain_tumor_3_classeshttps://huggingface.co/datasets/chamisfum/brai...8NoneNoneNoneNoneNoneFailed to load cardNone0minimal
.......................................
503185ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:28:15+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299rich
503186ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:29:40+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299rich
503187Dongkkka/ffw_bg2_rev4_custom_0919_5https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:30:53+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299rich
503188chenxing1234567890/eval_testZ1.2.1https://huggingface.co/datasets/chenxing123456...0Noneapache-2.0LeRobot, tutorialrobotics2025-09-19 06:34:11+00:00Nonedataset_readmes/chenxing1234567890__eval_testZ...231rich
503189Dongkkka/ffw_bg2_rev4_custom_0919_6https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:34:09+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299rich
\n", "

503190 rows × 12 columns

\n", "
" ], "text/plain": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "... ... \n", "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n", "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n", "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n", "503188 chenxing1234567890/eval_testZ1.2.1 \n", "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "... ... ... ... \n", "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n", "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "\n", " license tags task_categories \\\n", "0 None None None \n", "1 openrail None None \n", "2 cc None None \n", "3 None None None \n", "4 None None None \n", "... ... ... ... 
\n", "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503188 apache-2.0 LeRobot, tutorial robotics \n", "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "\n", " last_modified reason \\\n", "0 2024-01-30 07:40:02+00:00 No metadata and no description \n", "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n", "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n", "3 None Failed to load card \n", "4 None Failed to load card \n", "... ... ... \n", "503185 2025-09-19 06:28:15+00:00 None \n", "503186 2025-09-19 06:29:40+00:00 None \n", "503187 2025-09-19 06:30:53+00:00 None \n", "503188 2025-09-19 06:34:11+00:00 None \n", "503189 2025-09-19 06:34:09+00:00 None \n", "\n", " readme_path word_count category \n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 minimal \n", "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n", "3 None 0 minimal \n", "4 None 0 minimal \n", "... ... ... ... \n", "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n", "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 rich \n", "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 rich \n", "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 rich \n", "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 
299 rich \n", "\n", "[503190 rows x 12 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df = pd.concat([df1, df2], ignore_index=True)\n", "merged_df" ] }, { "cell_type": "code", "execution_count": 21, "id": "e0623157", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(11, 7)\n", " id \\\n", "623 introspector/unimath \n", "766 ekim15/bone_marrow_cell_dataset \n", "1645 fabriciojm/ecg-examples \n", "3280 ahork/record-test-6 \n", "3281 RickRain/SecondTrySimData3 \n", "\n", " url \\\n", "623 https://huggingface.co/datasets/introspector/u... \n", "766 https://huggingface.co/datasets/ekim15/bone_ma... \n", "1645 https://huggingface.co/datasets/fabriciojm/ecg... \n", "3280 https://huggingface.co/datasets/ahork/record-t... \n", "3281 https://huggingface.co/datasets/RickRain/Secon... \n", "\n", " field keyword missing_readme missing_card \\\n", "623 mathematics_and_statistics math False True \n", "766 life_sciences biology True False \n", "1645 life_sciences medical True False \n", "3280 engineering_and_technology robotics True False \n", "3281 engineering_and_technology robotics True False \n", "\n", " _id_lower \n", "623 introspector/unimath \n", "766 ekim15/bone_marrow_cell_dataset \n", "1645 fabriciojm/ecg-examples \n", "3280 ahork/record-test-6 \n", "3281 rickrain/secondtrysimdata3 \n" ] } ], "source": [ "# Create lowercase helper columns used for case-insensitive id matching\n", "df1[\"_dataset_id_lower\"] = df1[\"dataset_id\"].str.lower()\n", "csv_df[\"_id_lower\"] = csv_df[\"id\"].str.lower()\n", "# merged_df was concatenated before df1 gained its helper column, so it needs\n", "# its own key; the later merge joins on merged_df[\"_dataset_id_lower\"].\n", "merged_df[\"_dataset_id_lower\"] = merged_df[\"dataset_id\"].str.lower()\n", "\n", "# Get the rows from csv_df whose id is NOT in df1 (the minimal-cards frame)\n", "df3_missed = csv_df[~csv_df[\"_id_lower\"].isin(df1[\"_dataset_id_lower\"])]\n", "\n", "print(df3_missed.shape)\n", "print(df3_missed.head())\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "b6dbce79", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['introspector/unimath',\n", " 
'https://huggingface.co/datasets/introspector/unimath',\n", " 'mathematics_and_statistics', 'math', False, True,\n", " 'introspector/unimath'],\n", " ['ekim15/bone_marrow_cell_dataset',\n", " 'https://huggingface.co/datasets/ekim15/bone_marrow_cell_dataset',\n", " 'life_sciences', 'biology', True, False,\n", " 'ekim15/bone_marrow_cell_dataset'],\n", " ['fabriciojm/ecg-examples',\n", " 'https://huggingface.co/datasets/fabriciojm/ecg-examples',\n", " 'life_sciences', 'medical', True, False,\n", " 'fabriciojm/ecg-examples'],\n", " ['ahork/record-test-6',\n", " 'https://huggingface.co/datasets/ahork/record-test-6',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'ahork/record-test-6'],\n", " ['RickRain/SecondTrySimData3',\n", " 'https://huggingface.co/datasets/RickRain/SecondTrySimData3',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'rickrain/secondtrysimdata3'],\n", " ['MulixBF/record-cube-pick-2cam-black-2',\n", " 'https://huggingface.co/datasets/MulixBF/record-cube-pick-2cam-black-2',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'mulixbf/record-cube-pick-2cam-black-2'],\n", " ['ricdigi/1two-camera3-test2345',\n", " 'https://huggingface.co/datasets/ricdigi/1two-camera3-test2345',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'ricdigi/1two-camera3-test2345'],\n", " ['Ninkofu/sushi_put',\n", " 'https://huggingface.co/datasets/Ninkofu/sushi_put',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'ninkofu/sushi_put'],\n", " ['jokla89/record-test-temp1',\n", " 'https://huggingface.co/datasets/jokla89/record-test-temp1',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'jokla89/record-test-temp1'],\n", " ['LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n", " 'https://huggingface.co/datasets/LeRobot-worldwide-hackathon/325-casino-dealer-dice-set',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 
'lerobot-worldwide-hackathon/325-casino-dealer-dice-set'],\n", " ['jackvial/koch_screwdriver_attach_orange_panel_e125',\n", " 'https://huggingface.co/datasets/jackvial/koch_screwdriver_attach_orange_panel_e125',\n", " 'engineering_and_technology', 'robotics', True, False,\n", " 'jackvial/koch_screwdriver_attach_orange_panel_e125']],\n", " dtype=object)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show every unmatched row in full as a plain object array\n", "df3_missed.to_numpy()" ] }, { "cell_type": "code", "execution_count": 26, "id": "0cec2023", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataset_iddataset_urldownloadsauthorlicensetagstask_categorieslast_modifiedreasonreadme_pathword_countcategory_dataset_id_lower
0akjadhav/leandojo-lean4-formal-informal-stringshttps://huggingface.co/datasets/akjadhav/leand...22NoneNoneNoneNone2024-01-30 07:40:02+00:00No metadata and no descriptiondataset_readmes/akjadhav__leandojo-lean4-forma...0minimalakjadhav/leandojo-lean4-formal-informal-strings
1aemska/stuhlhttps://huggingface.co/datasets/aemska/stuhl11NoneopenrailNoneNone2022-11-11 14:12:36+00:00Short description (char count=0, words=0)dataset_readmes/aemska__stuhl_README.md0minimalaemska/stuhl
2Pogpotatofarmer/memeshttps://huggingface.co/datasets/Pogpotatofarme...15NoneccNoneNone2022-07-15 21:11:34+00:00Short description (char count=0, words=0)dataset_readmes/Pogpotatofarmer__memes_README.md0minimalpogpotatofarmer/memes
3Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100hhttps://huggingface.co/datasets/Splend1dchan/N...11NoneNoneNoneNoneNoneFailed to load cardNone0minimalsplend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h
4chamisfum/brain_tumor_3_classeshttps://huggingface.co/datasets/chamisfum/brai...8NoneNoneNoneNoneNoneFailed to load cardNone0minimalchamisfum/brain_tumor_3_classes
..........................................
503185ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:28:15+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richrobotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14
503186ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:29:40+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richrobotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15
503187Dongkkka/ffw_bg2_rev4_custom_0919_5https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:30:53+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richdongkkka/ffw_bg2_rev4_custom_0919_5
503188chenxing1234567890/eval_testZ1.2.1https://huggingface.co/datasets/chenxing123456...0Noneapache-2.0LeRobot, tutorialrobotics2025-09-19 06:34:11+00:00Nonedataset_readmes/chenxing1234567890__eval_testZ...231richchenxing1234567890/eval_testz1.2.1
503189Dongkkka/ffw_bg2_rev4_custom_0919_6https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:34:09+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richdongkkka/ffw_bg2_rev4_custom_0919_6
\n", "

503190 rows × 13 columns

\n", "
" ], "text/plain": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "... ... \n", "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n", "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n", "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n", "503188 chenxing1234567890/eval_testZ1.2.1 \n", "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "... ... ... ... \n", "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n", "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "\n", " license tags task_categories \\\n", "0 None None None \n", "1 openrail None None \n", "2 cc None None \n", "3 None None None \n", "4 None None None \n", "... ... ... ... 
\n", "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503188 apache-2.0 LeRobot, tutorial robotics \n", "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "\n", " last_modified reason \\\n", "0 2024-01-30 07:40:02+00:00 No metadata and no description \n", "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n", "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n", "3 None Failed to load card \n", "4 None Failed to load card \n", "... ... ... \n", "503185 2025-09-19 06:28:15+00:00 None \n", "503186 2025-09-19 06:29:40+00:00 None \n", "503187 2025-09-19 06:30:53+00:00 None \n", "503188 2025-09-19 06:34:11+00:00 None \n", "503189 2025-09-19 06:34:09+00:00 None \n", "\n", " readme_path word_count \\\n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n", "1 dataset_readmes/aemska__stuhl_README.md 0 \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n", "3 None 0 \n", "4 None 0 \n", "... ... ... \n", "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n", "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "\n", " category _dataset_id_lower \n", "0 minimal akjadhav/leandojo-lean4-formal-informal-strings \n", "1 minimal aemska/stuhl \n", "2 minimal pogpotatofarmer/memes \n", "3 minimal splend1dchan/nmsqa_sew-d-tiny-100k-ft-ls100h \n", "4 minimal chamisfum/brain_tumor_3_classes \n", "... ... ... 
\n", "503185 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n", "503186 rich robotis/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n", "503187 rich dongkkka/ffw_bg2_rev4_custom_0919_5 \n", "503188 rich chenxing1234567890/eval_testz1.2.1 \n", "503189 rich dongkkka/ffw_bg2_rev4_custom_0919_6 \n", "\n", "[503190 rows x 13 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df" ] }, { "cell_type": "code", "execution_count": 27, "id": "2bc30fa7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(503190, 14)\n", " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "\n", " license tags task_categories last_modified \\\n", "0 None None None 2024-01-30 07:40:02+00:00 \n", "1 openrail None None 2022-11-11 14:12:36+00:00 \n", "2 cc None None 2022-07-15 21:11:34+00:00 \n", "3 None None None None \n", "4 None None None None \n", "\n", " reason \\\n", "0 No metadata and no description \n", "1 Short description (char count=0, words=0) \n", "2 Short description (char count=0, words=0) \n", "3 Failed to load card \n", "4 Failed to load card \n", "\n", " readme_path word_count category \\\n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 
0 minimal \n", "1 dataset_readmes/aemska__stuhl_README.md 0 minimal \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 minimal \n", "3 None 0 minimal \n", "4 None 0 minimal \n", "\n", " field keyword \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 life_sciences brain \n" ] } ], "source": [ "# Merge on lowercase columns to bring 'field' and 'keyword' from csv_df\n", "merged_df = merged_df.merge(\n", " csv_df[[\"_id_lower\", \"field\", \"keyword\"]],\n", " left_on=\"_dataset_id_lower\",\n", " right_on=\"_id_lower\",\n", " how=\"left\"\n", ")\n", "\n", "# Drop the helper columns\n", "merged_df = merged_df.drop(columns=[\"_dataset_id_lower\", \"_id_lower\"])\n", "\n", "# Quick check\n", "print(merged_df.shape)\n", "print(merged_df.head())\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "4b104aef", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataset_iddataset_urldownloadsauthorlicensetagstask_categorieslast_modifiedreasonreadme_pathword_countcategoryfieldkeyword
0akjadhav/leandojo-lean4-formal-informal-stringshttps://huggingface.co/datasets/akjadhav/leand...22NoneNoneNoneNone2024-01-30 07:40:02+00:00No metadata and no descriptiondataset_readmes/akjadhav__leandojo-lean4-forma...0minimalNaNNaN
1aemska/stuhlhttps://huggingface.co/datasets/aemska/stuhl11NoneopenrailNoneNone2022-11-11 14:12:36+00:00Short description (char count=0, words=0)dataset_readmes/aemska__stuhl_README.md0minimalNaNNaN
2Pogpotatofarmer/memeshttps://huggingface.co/datasets/Pogpotatofarme...15NoneccNoneNone2022-07-15 21:11:34+00:00Short description (char count=0, words=0)dataset_readmes/Pogpotatofarmer__memes_README.md0minimalNaNNaN
3Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100hhttps://huggingface.co/datasets/Splend1dchan/N...11NoneNoneNoneNoneNoneFailed to load cardNone0minimalNaNNaN
4chamisfum/brain_tumor_3_classeshttps://huggingface.co/datasets/chamisfum/brai...8NoneNoneNoneNoneNoneFailed to load cardNone0minimallife_sciencesbrain
.............................................
503185ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:28:15+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richNaNNaN
503186ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15https://huggingface.co/datasets/ROBOTIS/ffw_bg...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:29:40+00:00Nonedataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richNaNNaN
503187Dongkkka/ffw_bg2_rev4_custom_0919_5https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:30:53+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richNaNNaN
503188chenxing1234567890/eval_testZ1.2.1https://huggingface.co/datasets/chenxing123456...0Noneapache-2.0LeRobot, tutorialrobotics2025-09-19 06:34:11+00:00Nonedataset_readmes/chenxing1234567890__eval_testZ...231richNaNNaN
503189Dongkkka/ffw_bg2_rev4_custom_0919_6https://huggingface.co/datasets/Dongkkka/ffw_b...0Noneapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:34:09+00:00Nonedataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richNaNNaN
\n", "

503190 rows × 14 columns

\n", "
" ], "text/plain": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "... ... \n", "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n", "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n", "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n", "503188 chenxing1234567890/eval_testZ1.2.1 \n", "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 None \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 None \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 None \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 None \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 None \n", "... ... ... ... \n", "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 None \n", "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "503188 https://huggingface.co/datasets/chenxing123456... 0 None \n", "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 None \n", "\n", " license tags task_categories \\\n", "0 None None None \n", "1 openrail None None \n", "2 cc None None \n", "3 None None None \n", "4 None None None \n", "... ... ... ... 
\n", "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503188 apache-2.0 LeRobot, tutorial robotics \n", "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "\n", " last_modified reason \\\n", "0 2024-01-30 07:40:02+00:00 No metadata and no description \n", "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n", "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n", "3 None Failed to load card \n", "4 None Failed to load card \n", "... ... ... \n", "503185 2025-09-19 06:28:15+00:00 None \n", "503186 2025-09-19 06:29:40+00:00 None \n", "503187 2025-09-19 06:30:53+00:00 None \n", "503188 2025-09-19 06:34:11+00:00 None \n", "503189 2025-09-19 06:34:09+00:00 None \n", "\n", " readme_path word_count \\\n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n", "1 dataset_readmes/aemska__stuhl_README.md 0 \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n", "3 None 0 \n", "4 None 0 \n", "... ... ... \n", "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n", "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "\n", " category field keyword \n", "0 minimal NaN NaN \n", "1 minimal NaN NaN \n", "2 minimal NaN NaN \n", "3 minimal NaN NaN \n", "4 minimal life_sciences brain \n", "... ... ... ... 
\n", "503185 rich NaN NaN \n", "503186 rich NaN NaN \n", "503187 rich NaN NaN \n", "503188 rich NaN NaN \n", "503189 rich NaN NaN \n", "\n", "[503190 rows x 14 columns]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df" ] }, { "cell_type": "code", "execution_count": 30, "id": "69ec9289", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of rows with a value in 'science' column: 4040\n" ] } ], "source": [ "import numpy as np\n", "\n", "# Replace all None with np.nan\n", "merged_df = merged_df.replace({None: np.nan})\n", "\n", "# Count rows where the 'field' column has a value (not NaN)\n", "science_count = merged_df[\"field\"].notna().sum()\n", "\n", "print(f\"Number of rows with a value in 'science' column: {science_count}\")\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "b0d58ceb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataset_iddataset_urldownloadsauthorlicensetagstask_categorieslast_modifiedreasonreadme_pathword_countcategoryfieldkeyword
0akjadhav/leandojo-lean4-formal-informal-stringshttps://huggingface.co/datasets/akjadhav/leand...22NaNNaNNaNNaN2024-01-30 07:40:02+00:00No metadata and no descriptiondataset_readmes/akjadhav__leandojo-lean4-forma...0minimalNaNNaN
1aemska/stuhlhttps://huggingface.co/datasets/aemska/stuhl11NaNopenrailNaNNaN2022-11-11 14:12:36+00:00Short description (char count=0, words=0)dataset_readmes/aemska__stuhl_README.md0minimalNaNNaN
2Pogpotatofarmer/memeshttps://huggingface.co/datasets/Pogpotatofarme...15NaNccNaNNaN2022-07-15 21:11:34+00:00Short description (char count=0, words=0)dataset_readmes/Pogpotatofarmer__memes_README.md0minimalNaNNaN
3Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100hhttps://huggingface.co/datasets/Splend1dchan/N...11NaNNaNNaNNaNNaNFailed to load cardNaN0minimalNaNNaN
4chamisfum/brain_tumor_3_classeshttps://huggingface.co/datasets/chamisfum/brai...8NaNNaNNaNNaNNaNFailed to load cardNaN0minimallife_sciencesbrain
.............................................
503185ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14https://huggingface.co/datasets/ROBOTIS/ffw_bg...0NaNapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:28:15+00:00NaNdataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richNaNNaN
503186ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15https://huggingface.co/datasets/ROBOTIS/ffw_bg...0NaNapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:29:40+00:00NaNdataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof...299richNaNNaN
503187Dongkkka/ffw_bg2_rev4_custom_0919_5https://huggingface.co/datasets/Dongkkka/ffw_b...0NaNapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:30:53+00:00NaNdataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richNaNNaN
503188chenxing1234567890/eval_testZ1.2.1https://huggingface.co/datasets/chenxing123456...0NaNapache-2.0LeRobot, tutorialrobotics2025-09-19 06:34:11+00:00NaNdataset_readmes/chenxing1234567890__eval_testZ...231richNaNNaN
503189Dongkkka/ffw_bg2_rev4_custom_0919_6https://huggingface.co/datasets/Dongkkka/ffw_b...0NaNapache-2.0LeRobot, ffw_bg2_rev4_custom, robotisrobotics2025-09-19 06:34:09+00:00NaNdataset_readmes/Dongkkka__ffw_bg2_rev4_custom_...299richNaNNaN
\n", "

503190 rows × 14 columns

\n", "
" ], "text/plain": [ " dataset_id \\\n", "0 akjadhav/leandojo-lean4-formal-informal-strings \n", "1 aemska/stuhl \n", "2 Pogpotatofarmer/memes \n", "3 Splend1dchan/NMSQA_sew-d-tiny-100k-ft-ls100h \n", "4 chamisfum/brain_tumor_3_classes \n", "... ... \n", "503185 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_14 \n", "503186 ROBOTIS/ffw_bg2_rev4_pick_coffee_bottle_env5_15 \n", "503187 Dongkkka/ffw_bg2_rev4_custom_0919_5 \n", "503188 chenxing1234567890/eval_testZ1.2.1 \n", "503189 Dongkkka/ffw_bg2_rev4_custom_0919_6 \n", "\n", " dataset_url downloads author \\\n", "0 https://huggingface.co/datasets/akjadhav/leand... 22 NaN \n", "1 https://huggingface.co/datasets/aemska/stuhl 11 NaN \n", "2 https://huggingface.co/datasets/Pogpotatofarme... 15 NaN \n", "3 https://huggingface.co/datasets/Splend1dchan/N... 11 NaN \n", "4 https://huggingface.co/datasets/chamisfum/brai... 8 NaN \n", "... ... ... ... \n", "503185 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n", "503186 https://huggingface.co/datasets/ROBOTIS/ffw_bg... 0 NaN \n", "503187 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n", "503188 https://huggingface.co/datasets/chenxing123456... 0 NaN \n", "503189 https://huggingface.co/datasets/Dongkkka/ffw_b... 0 NaN \n", "\n", " license tags task_categories \\\n", "0 NaN NaN NaN \n", "1 openrail NaN NaN \n", "2 cc NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... 
\n", "503185 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503186 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503187 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "503188 apache-2.0 LeRobot, tutorial robotics \n", "503189 apache-2.0 LeRobot, ffw_bg2_rev4_custom, robotis robotics \n", "\n", " last_modified reason \\\n", "0 2024-01-30 07:40:02+00:00 No metadata and no description \n", "1 2022-11-11 14:12:36+00:00 Short description (char count=0, words=0) \n", "2 2022-07-15 21:11:34+00:00 Short description (char count=0, words=0) \n", "3 NaN Failed to load card \n", "4 NaN Failed to load card \n", "... ... ... \n", "503185 2025-09-19 06:28:15+00:00 NaN \n", "503186 2025-09-19 06:29:40+00:00 NaN \n", "503187 2025-09-19 06:30:53+00:00 NaN \n", "503188 2025-09-19 06:34:11+00:00 NaN \n", "503189 2025-09-19 06:34:09+00:00 NaN \n", "\n", " readme_path word_count \\\n", "0 dataset_readmes/akjadhav__leandojo-lean4-forma... 0 \n", "1 dataset_readmes/aemska__stuhl_README.md 0 \n", "2 dataset_readmes/Pogpotatofarmer__memes_README.md 0 \n", "3 NaN 0 \n", "4 NaN 0 \n", "... ... ... \n", "503185 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503186 dataset_readmes/ROBOTIS__ffw_bg2_rev4_pick_cof... 299 \n", "503187 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "503188 dataset_readmes/chenxing1234567890__eval_testZ... 231 \n", "503189 dataset_readmes/Dongkkka__ffw_bg2_rev4_custom_... 299 \n", "\n", " category field keyword \n", "0 minimal NaN NaN \n", "1 minimal NaN NaN \n", "2 minimal NaN NaN \n", "3 minimal NaN NaN \n", "4 minimal life_sciences brain \n", "... ... ... ... 
\n", "503185 rich NaN NaN \n", "503186 rich NaN NaN \n", "503187 rich NaN NaN \n", "503188 rich NaN NaN \n", "503189 rich NaN NaN \n", "\n", "[503190 rows x 14 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_df" ] }, { "cell_type": "code", "execution_count": 32, "id": "d8d61dc6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "merged_df saved to 'datasetcards.parquet'\n" ] } ], "source": [ "# Save to parquet\n", "merged_df.to_parquet(\"datasetcards.parquet\", engine=\"pyarrow\", index=False)\n", "\n", "print(\"merged_df saved to 'datasetcards.parquet'\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "hftest", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }