Spaces:

Md-Hakim
/

text-summarization

Sleeping

File size: 8,774 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'c:\\\\mlops projects\\\\text-summarization'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "@dataclass(frozen=True)\n",
    "class DataTransformationConfig:\n",
    "    root_dir : Path\n",
    "    data_path : Path\n",
    "    tokenizer_name : Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textsummarizer.constants import *\n",
    "from textsummarizer.utils.common import read_yaml, create_directories\n",
    "\n",
    "\n",
    "class ConfigurationManager:\n",
    "    def __init__(\n",
    "        self,\n",
    "        config_filepath = CONFIG_FILE_PATH,\n",
    "        params_filepath = PARAMS_FILE_PATH):\n",
    "\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "\n",
    "    \n",
    "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
    "        config = self.config.data_transformation\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        data_transformation_config = DataTransformationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            data_path=config.data_path,\n",
    "            tokenizer_name = config.tokenizer_name\n",
    "        )\n",
    "\n",
    "        return data_transformation_config\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 18:13:05,488: INFO: config: PyTorch version 2.2.2+cu121 available.]\n",
      "[2024-08-11 18:13:05,490: INFO: config: TensorFlow version 2.12.0 available.]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from textsummarizer.logging import logger\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset, load_from_disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DataTransformation:\n",
    "    def __init__(self, config : DataTransformationConfig):\n",
    "        self.config = config\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)\n",
    "        \n",
    "        \n",
    "    def convert_examples_to_features(self, example_batch):\n",
    "        input_encoding = self.tokenizer(example_batch['dialogue'], max_length = 1024, truncation = True)\n",
    "        \n",
    "        with self.tokenizer.as_target_tokenizer():\n",
    "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )\n",
    "            \n",
    "        return {\n",
    "            'input_ids' : input_encoding['input_ids'],\n",
    "            'attention_mask': input_encoding['attention_mask'],\n",
    "            'labels': target_encodings['input_ids']\n",
    "        }\n",
    "        \n",
    "    def convert(self):\n",
    "        dataset_samsum = load_from_disk(self.config.data_path)\n",
    "        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)\n",
    "        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,\"samsum_dataset\"))        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-08-11 18:13:44,678: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-08-11 18:13:44,681: INFO: common: yaml file: params.yaml loaded successfully]\n",
      "[2024-08-11 18:13:44,684: INFO: common: created directory at: artifacts]\n",
      "[2024-08-11 18:13:44,686: INFO: common: created directory at: artifacts/data_transformation]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bdedbcfbff63497081e37ad9b20a6c31",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/14732 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3b8826d099004000a2a037e32bbdf1cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/819 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a5e1728a7d142d3b767f7b9c8f14c6f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/818 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "54194e4ec3de42738a2107fa26673aef",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0209e20f794e4e3ab60ef282b98b8bb3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0a89fb4c0a96413782a55206d087a2a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    data_transformation_config = config.get_data_transformation_config()\n",
    "    data_transformation = DataTransformation(config=data_transformation_config)\n",
    "    data_transformation.convert()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}