# %% [markdown]
# # Conversion of the Latest Dataframe to Parquet
#
# We need to store our dataset in a warehouse, so we convert it to Parquet.

# %%
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Load the tab-separated, UTF-8 encoded CSV into a pandas DataFrame.
# Kept under its own name so the raw input is never overwritten by later cells.
dialogues_raw = pd.read_csv('dialogues.csv', sep='\t', encoding='utf-8')

# %%
# Convert the pandas DataFrame to an Arrow Table.
dialogues_table = pa.Table.from_pandas(dialogues_raw)

# Output file path for the Parquet file.
parquet_file_path = './data/parquet/dialogues.parquet'

# pq.write_table does not create missing parent directories, so make sure the
# target folder exists first (otherwise a first run fails with FileNotFoundError).
os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)

# Write the Arrow Table to a Parquet file.
pq.write_table(dialogues_table, parquet_file_path)

print(f'DataFrame saved to Parquet file: {parquet_file_path}')

# %%
# Read the Parquet file back and convert it to a pandas DataFrame.
# `df` is the round-tripped frame that the following cells display.
roundtrip_table = pq.read_table(parquet_file_path)
df = roundtrip_table.to_pandas()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DescriptionPatientDoctor
0Q. What does abutment of the nerve root mean?Hi doctor,I am just wondering what is abutting...Hi. I have gone through your query with dilige...
1Q. What should I do to reduce my weight gained...Hi doctor, I am a 22-year-old female who was d...Hi. You have really done well with the hypothy...
2Q. I have started to get lots of acne on my fa...Hi doctor! I used to have clear skin but since...Hi there Acne has multifactorial etiology. Onl...
3Q. Why do I have uncomfortable feeling between...Hello doctor,I am having an uncomfortable feel...Hello. The popping and discomfort what you fel...
4Q. My symptoms after intercourse threatns me e...Hello doctor,Before two years had sex with a c...Hello. The HIV test uses a finger prick blood ...
\n", "
# %%
# Preview the first rows of the DataFrame read back from Parquet.
df.head()

# %%
import os

import pyarrow.parquet as pq
import yaml

# Arrow spells a few primitive types differently from the dtype strings used
# in Hugging Face dataset cards; any type not listed here is passed through
# as Arrow prints it.
# NOTE(review): nested Arrow types (list/struct) are not translated - confirm
# whether they need support before using this on non-flat tables.
_ARROW_TO_HF_DTYPE = {
    'halffloat': 'float16',
    'float': 'float32',
    'double': 'float64',
    'date32[day]': 'date32',
    'date64[ms]': 'date64',
}


def _hf_dtype(arrow_type):
    """Return the dataset-card dtype string for an Arrow data type."""
    arrow_name = str(arrow_type)
    return _ARROW_TO_HF_DTYPE.get(arrow_name, arrow_name)


def generate_hf_metadata(parquet_file_path, dataset_name, split_name='train', split_path_pattern='data/train-*'):
    """Write a Hugging Face style metadata YAML file for a Parquet dataset.

    The column names/types and the row count are taken from the Parquet
    file's own schema and footer, so the data itself is never loaded.

    Args:
        parquet_file_path: Path of the Parquet file to describe.
        dataset_name: Prefix of the output file, which is written to
            ``{dataset_name}_metadata.yaml`` in the current directory.
        split_name: Name recorded for the single split.
        split_path_pattern: Path pattern recorded under ``data_files`` for
            that split.

    Returns:
        None. The metadata is written to disk and the output path is printed.

    Note:
        Columns are listed exactly as stored in the Parquet file.
        NOTE(review): if the file was written from a DataFrame with a
        non-default index, the persisted index column will be listed too -
        confirm that is the desired behavior.
    """
    # Schema and row count live in the Parquet footer; no need to read rows.
    schema = pq.read_schema(parquet_file_path)
    num_examples = pq.read_metadata(parquet_file_path).num_rows

    # The on-disk file size is used for every size field below.
    num_bytes = os.path.getsize(parquet_file_path)

    features = [
        {'name': field.name, 'dtype': _hf_dtype(field.type)}
        for field in schema
    ]

    metadata = {
        'configs': [
            {
                'config_name': 'default',
                'data_files': [
                    {
                        'split': split_name,
                        'path': split_path_pattern
                    }
                ]
            }
        ],
        'dataset_info': {
            'features': features,
            'splits': [
                {
                    'name': split_name,
                    'num_bytes': num_bytes,
                    'num_examples': num_examples
                }
            ],
            'download_size': num_bytes,
            'dataset_size': num_bytes
        }
    }

    # Serialize as real YAML (not a Python dict repr) and keep the key order
    # defined above instead of sorting alphabetically.
    metadata_file_path = f'{dataset_name}_metadata.yaml'
    with open(metadata_file_path, 'w', encoding='utf-8') as metadata_file:
        yaml.safe_dump(metadata, metadata_file, sort_keys=False)

    print(f'Metadata file saved at: {metadata_file_path}')


# %%
# Example usage
parquet_file_path = './data/parquet/dialogues.parquet'
dataset_name = 'dialogues'
generate_hf_metadata(parquet_file_path, dataset_name)

# %%
def generate_markdown_from_metadata(yaml_file_path, dataset_name, dataset_card_path):
    """Build a dataset card (Markdown with YAML front matter) from a metadata file.

    Args:
        yaml_file_path: Path of the YAML metadata file to embed.
        dataset_name: Name shown in the card's title line.
        dataset_card_path: Path the Markdown card is written to.

    Returns:
        None. The card is written to disk and the output path is printed.
    """
    with open(yaml_file_path, 'r', encoding='utf-8') as yaml_file:
        # safe_load is sufficient here: the metadata consists only of plain
        # mappings, lists, strings and integers, and it never constructs
        # arbitrary Python objects from the file.
        metadata = yaml.safe_load(yaml_file)

    # sort_keys=False keeps the field order of the metadata file in the card.
    front_matter = yaml.safe_dump(metadata, sort_keys=False)

    markdown_content = f"---\n{front_matter}\n---\n# Dataset Card for \"{dataset_name}\"\n\n[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"

    with open(dataset_card_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

    print(f'Markdown file saved at: {dataset_card_path}')


# %%
# Example usage
yaml_file_path = 'dialogues_metadata.yaml'
dataset_name = 'dialogues'
dataset_card_path = 'dialogues_dataset_card.md'
generate_markdown_from_metadata(yaml_file_path, dataset_name, dataset_card_path)