satyam998 committed
Commit 95576a3 · 0 parents

Initial commit

Files changed (41)
  1. .github/workflows/main.yml +34 -0
  2. .gitignore +162 -0
  3. Dockerfile +17 -0
  4. LICENSE +201 -0
  5. README.md +11 -0
  6. app.py +36 -0
  7. config/config.yaml +34 -0
  8. main.py +66 -0
  9. params.yaml +10 -0
  10. requirements.txt +23 -0
  11. research/data_ingestion.ipynb +183 -0
  12. research/data_transformation.ipynb +234 -0
  13. research/data_validation.ipynb +197 -0
  14. research/experiment.ipynb +74 -0
  15. research/model_evaluation.ipynb +265 -0
  16. research/model_trainer.ipynb +239 -0
  17. setup.py +22 -0
  18. src/summarylm/__init__.py +0 -0
  19. src/summarylm/components/__init__.py +0 -0
  20. src/summarylm/components/data_ingestion.py +39 -0
  21. src/summarylm/components/data_transformation.py +107 -0
  22. src/summarylm/components/data_validation.py +45 -0
  23. src/summarylm/components/model_evaluation.py +69 -0
  24. src/summarylm/components/model_trainer.py +69 -0
  25. src/summarylm/config/__init__.py +0 -0
  26. src/summarylm/config/configuration.py +100 -0
  27. src/summarylm/config/gcloud_syncer.py +34 -0
  28. src/summarylm/constants/__init__.py +4 -0
  29. src/summarylm/entity/__init__.py +45 -0
  30. src/summarylm/exception/__init__.py +34 -0
  31. src/summarylm/logging/__init__.py +22 -0
  32. src/summarylm/pipeline/__init__.py +0 -0
  33. src/summarylm/pipeline/data_ingestion.py +22 -0
  34. src/summarylm/pipeline/data_transformation.py +22 -0
  35. src/summarylm/pipeline/data_validation.py +22 -0
  36. src/summarylm/pipeline/model_evaluation.py +22 -0
  37. src/summarylm/pipeline/model_trainer.py +22 -0
  38. src/summarylm/pipeline/prediction.py +17 -0
  39. src/summarylm/utils/__init__.py +0 -0
  40. src/summarylm/utils/common.py +65 -0
  41. template.py +53 -0
.github/workflows/main.yml ADDED
@@ -0,0 +1,34 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # To run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       - name: Set up Git user
+         run: |
+           git config --global user.email "[email protected]"
+           git config --global user.name "satyam998"
+
+       - name: Create a new branch
+         run: |
+           git checkout --orphan temp
+           git add -A
+           git commit -m "Initial commit"
+           git branch -D main
+           git branch -m main
+
+       - name: Force push to hub
+         env:
+           HF: ${{ secrets.HG }}
+         run: git push --force https://satyam998:[email protected]/spaces/satyam998/pegasus-summary-lm main
.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+ artifacts/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+ artifacts/
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+
+ WORKDIR /app
+
+ COPY --chown=user . /app
+
+ RUN pip install -r requirements.txt
+ RUN pip install --upgrade accelerate
+ RUN pip uninstall -y transformers accelerate
+ RUN pip install transformers accelerate
+
+ RUN mkdir -p /app/logs
+ RUN chmod 777 /app/logs
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Pegasus Summary Lm
+ emoji: 🔥
+ colorFrom: green
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,36 @@
+ from fastapi import FastAPI
+ import uvicorn
+ import sys
+ import os
+ from fastapi.templating import Jinja2Templates
+ from starlette.responses import RedirectResponse
+ from fastapi.responses import Response
+ from summarylm.pipeline.prediction import PredictionPipeline
+ from summarylm.exception import CustomException
+
+ text: str = "What is Text Summarization?"
+
+ app = FastAPI()
+
+ @app.get("/", tags=["authentication"])
+ async def index():
+     return RedirectResponse(url='/docs')
+
+ @app.get("/train")
+ async def training():
+     try:
+         os.system("python main.py")
+         return Response("Training Successful!!")
+     except Exception as e:
+         return Response(f"Error Occurred! {e}")
+
+ @app.post("/predict")
+ async def predict_route(text, max_length: int = 128):
+     try:
+         print(type(max_length))
+         obj = PredictionPipeline()
+         text = obj.predict(text, max_length)
+         return text
+     except Exception as e:
+         raise CustomException(e, sys) from e
+
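A minimal client sketch for exercising these endpoints (assuming the app is served on port 7860, as in the Dockerfile's uvicorn command; the base URL and timeout below are placeholders):

    import requests

    BASE_URL = "http://localhost:7860"  # hypothetical; substitute the deployed Space URL if applicable

    # predict_route declares text and max_length as plain parameters, so FastAPI
    # reads them from the query string even though the route is a POST.
    response = requests.post(
        f"{BASE_URL}/predict",
        params={"text": "What is Text Summarization?", "max_length": 128},
        timeout=300,
    )
    print(response.json())  # the summary string returned by PredictionPipeline.predict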
config/config.yaml ADDED
@@ -0,0 +1,34 @@
+ artifacts_root: artifacts
+
+ data_ingestion:
+   root_dir: artifacts/data_ingestion
+   ALL_HUGGINGFACE_DATA: ["d0rj/wikisum", "multi_news"]
+   LOCAL_DATA_FILE: ["artifacts/data_ingestion/wikisum", "artifacts/data_ingestion/multi_news"]
+
+
+ data_validation:
+   root_dir: artifacts/data_validation
+   STATUS_FILE: artifacts/data_validation/status.txt
+   ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
+   ALL_REQUIRED_FILES: ["train", "test", "validation"]
+
+
+ data_transformation:
+   root_dir: artifacts/data_transformation
+   data_path: artifacts/data_ingestion/
+   ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
+   tokenizer_name: google/pegasus-cnn_dailymail
+
+
+ model_trainer:
+   root_dir: artifacts/model_trainer
+   data_path: artifacts/data_transformation/dataset
+   model_ckpt: google/pegasus-cnn_dailymail
+
+
+ model_evaluation:
+   root_dir: artifacts/model_evaluation
+   data_path: artifacts/data_transformation/dataset
+   model_path: artifacts/model_trainer/pegasus-summary-lm
+   tokenizer_path: artifacts/model_trainer/tokenizer
+   metric_file_name: artifacts/model_evaluation/metrics.csv
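For orientation, a short sketch of how this file is consumed elsewhere in the repo (the ConfigurationManager pattern shown in the research notebooks below): read_yaml returns a Box-style object, so nested keys are available as attributes.

    from summarylm.constants import CONFIG_FILE_PATH
    from summarylm.utils.common import read_yaml, create_directories

    config = read_yaml(CONFIG_FILE_PATH)
    create_directories([config.artifacts_root])          # artifacts/
    print(config.data_ingestion.ALL_HUGGINGFACE_DATA)    # ["d0rj/wikisum", "multi_news"]
    print(config.model_trainer.model_ckpt)               # google/pegasus-cnn_dailymail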
main.py ADDED
@@ -0,0 +1,66 @@
+ import sys
+ from summarylm.pipeline.data_ingestion import DataIngestionPipeline
+ from summarylm.pipeline.data_validation import DataValidationPipeline
+ from summarylm.pipeline.data_transformation import DataTransformationPipeline
+ from summarylm.pipeline.model_trainer import ModelTrainerPipeline
+ from summarylm.pipeline.model_evaluation import ModelEvaluationPipeline
+ from summarylm.logging import logger
+ from summarylm.exception import CustomException
+
+ # data ingestion
+ STAGE_NAME = "Data Ingestion"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_ingestion = DataIngestionPipeline()
+     data_ingestion.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # data validation
+ STAGE_NAME = "Data Validation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_validation = DataValidationPipeline()
+     data_validation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # data transformation
+ STAGE_NAME = "Data Transformation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_transformation = DataTransformationPipeline()
+     data_transformation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # model trainer
+ STAGE_NAME = "Model Trainer"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     model_trainer = ModelTrainerPipeline()
+     model_trainer.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+ # model evaluation
+ STAGE_NAME = "Model Evaluation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     model_evaluation = ModelEvaluationPipeline()
+     model_evaluation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
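The five stage blocks above repeat the same try/except pattern; a possible refactor (not part of this commit) is to drive them from a single list:

    import sys
    from summarylm.pipeline.data_ingestion import DataIngestionPipeline
    from summarylm.pipeline.data_validation import DataValidationPipeline
    from summarylm.pipeline.data_transformation import DataTransformationPipeline
    from summarylm.pipeline.model_trainer import ModelTrainerPipeline
    from summarylm.pipeline.model_evaluation import ModelEvaluationPipeline
    from summarylm.logging import logger
    from summarylm.exception import CustomException

    STAGES = [
        ("Data Ingestion", DataIngestionPipeline),
        ("Data Validation", DataValidationPipeline),
        ("Data Transformation", DataTransformationPipeline),
        ("Model Trainer", ModelTrainerPipeline),
        ("Model Evaluation", ModelEvaluationPipeline),
    ]

    # Each pipeline class exposes a main() entry point, exactly as used above.
    for stage_name, pipeline_cls in STAGES:
        try:
            logger.info(f"Starting {stage_name} stage...")
            pipeline_cls().main()
            logger.info(f"Completed {stage_name} stage...")
        except Exception as e:
            raise CustomException(e, sys) from e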
params.yaml ADDED
@@ -0,0 +1,10 @@
+ TrainingArguments:
+   num_train_epochs: 1
+   warmup_steps: 500
+   per_device_train_batch_size: 1
+   weight_decay: 0.01
+   logging_steps: 10
+   evaluation_strategy: steps
+   eval_steps: 500
+   save_steps: 1e6
+   gradient_accumulation_steps: 16
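These values are intended for transformers.TrainingArguments (the actual wiring lives in src/summarylm/components/model_trainer.py, which is not expanded in this view). A rough sketch of the mapping, assuming the same read_yaml helper used throughout the repo:

    from transformers import TrainingArguments
    from summarylm.constants import PARAMS_FILE_PATH
    from summarylm.utils.common import read_yaml

    params = read_yaml(PARAMS_FILE_PATH).TrainingArguments

    training_args = TrainingArguments(
        output_dir="artifacts/model_trainer",   # assumed; matches model_trainer.root_dir in config.yaml
        num_train_epochs=params.num_train_epochs,
        warmup_steps=params.warmup_steps,
        per_device_train_batch_size=params.per_device_train_batch_size,
        weight_decay=params.weight_decay,
        logging_steps=params.logging_steps,
        evaluation_strategy=params.evaluation_strategy,
        eval_steps=params.eval_steps,
        save_steps=int(float(params.save_steps)),   # cast defensively: some YAML loaders read "1e6" as a string
        gradient_accumulation_steps=params.gradient_accumulation_steps,
    )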
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ transformers
+ transformers[sentencepiece]
+ transformers[torch]
+ datasets
+ sacrebleu
+ rouge_score
+ py7zr
+ pandas
+ nltk
+ tqdm
+ PyYAML
+ matplotlib
+ torch
+ notebook
+ boto3
+ mypy-boto3-s3
+ python-box==7.1.1
+ ensure==1.0.4
+ fastapi==0.78.0
+ uvicorn==0.29.0
+ Jinja2==3.1.4
+ google-cloud-storage
+ -e .
research/data_ingestion.ipynb ADDED
@@ -0,0 +1,183 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import zipfile\n",
11
+ "from dataclasses import dataclass\n",
12
+ "from pathlib import Path\n",
13
+ "\n",
14
+ "from summarylm.logging import logger\n",
15
+ "from summarylm.constants import *\n",
16
+ "from summarylm.utils.common import read_yaml, create_directories, get_size"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 2,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "os.chdir(\"../\")"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 5,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "@dataclass(frozen=True)\n",
35
+ "class DataIngestionConfig:\n",
36
+ " root_dir: Path\n",
37
+ " ALL_HUGGINGFACE_DATA: list\n",
38
+ " LOCAL_DATA_FILE: list"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 6,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "class ConfigurationManager:\n",
48
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:\n",
49
+ " self.config = read_yaml(config_filepath)\n",
50
+ " self.params = read_yaml(params_filepath)\n",
51
+ "\n",
52
+ " create_directories([self.config.artifacts_root])\n",
53
+ "\n",
54
+ " def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
55
+ " config = self.config.data_ingestion\n",
56
+ "\n",
57
+ " create_directories([config.root_dir])\n",
58
+ "\n",
59
+ " data_ingestion_config = DataIngestionConfig(\n",
60
+ " root_dir=config.root_dir,\n",
61
+ " ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,\n",
62
+ " LOCAL_DATA_FILE = config.LOCAL_DATA_FILE,\n",
63
+ " )\n",
64
+ "\n",
65
+ " return data_ingestion_config"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 14,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "from datasets import load_dataset\n",
75
+ "\n",
76
+ "class DataIngestion:\n",
77
+ " def __init__(self, config: DataIngestionConfig):\n",
78
+ " self.config = config\n",
79
+ "\n",
80
+ " def download_data(self):\n",
81
+ " for i in range(len(self.config.LOCAL_DATA_FILE)):\n",
82
+ " if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):\n",
83
+ " dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])\n",
84
+ " dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])\n",
85
+ " logger.info(f\"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!\")\n",
86
+ " else:\n",
87
+ " logger.info(f\"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}\")"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 15,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "[2024-05-23 07:53:04,706: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
100
+ "[2024-05-23 07:53:04,709: INFO: common: Yaml file: params.yaml loaded successfully]\n",
101
+ "[2024-05-23 07:53:04,710: INFO: common: Directory created successfully at: artifacts]\n",
102
+ "[2024-05-23 07:53:04,711: INFO: common: Directory created successfully at: artifacts/data_ingestion]\n",
103
+ "[2024-05-23 07:53:04,711: INFO: 368978256: File already exists of size: ~0 KB]\n"
104
+ ]
105
+ },
106
+ {
107
+ "name": "stderr",
108
+ "output_type": "stream",
109
+ "text": [
110
+ "Downloading data: 100%|██████████| 295M/295M [00:34<00:00, 8.46MB/s] \n",
111
+ "Downloading data: 100%|██████████| 28.3M/28.3M [00:05<00:00, 5.38MB/s]\n",
112
+ "Downloading data: 100%|██████████| 39.5M/39.5M [00:06<00:00, 5.72MB/s]\n",
113
+ "Downloading data: 100%|██████████| 40.1M/40.1M [00:06<00:00, 5.83MB/s]\n",
114
+ "Generating train split: 100%|██████████| 44972/44972 [00:03<00:00, 13618.69 examples/s]\n",
115
+ "Generating validation split: 100%|██████████| 5622/5622 [00:00<00:00, 25120.36 examples/s]\n",
116
+ "Generating test split: 100%|██████████| 5622/5622 [00:00<00:00, 22323.24 examples/s]\n",
117
+ "Saving the dataset (2/2 shards): 100%|██████████| 44972/44972 [00:07<00:00, 5653.51 examples/s] \n",
118
+ "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15343.69 examples/s]\n",
119
+ "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15216.24 examples/s]"
120
+ ]
121
+ },
122
+ {
123
+ "name": "stdout",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "[2024-05-23 07:54:25,968: INFO: 368978256: multi_news downloaded!]\n"
127
+ ]
128
+ },
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "try:\n",
139
+ " config = ConfigurationManager()\n",
140
+ " data_ingestion_config = config.get_data_ingestion_config()\n",
141
+ " data_ingestion = DataIngestion(config=data_ingestion_config)\n",
142
+ " data_ingestion.download_data()\n",
143
+ "except Exception as e:\n",
144
+ " raise e "
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": []
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": []
160
+ }
161
+ ],
162
+ "metadata": {
163
+ "kernelspec": {
164
+ "display_name": "env",
165
+ "language": "python",
166
+ "name": "python3"
167
+ },
168
+ "language_info": {
169
+ "codemirror_mode": {
170
+ "name": "ipython",
171
+ "version": 3
172
+ },
173
+ "file_extension": ".py",
174
+ "mimetype": "text/x-python",
175
+ "name": "python",
176
+ "nbconvert_exporter": "python",
177
+ "pygments_lexer": "ipython3",
178
+ "version": "3.12.2"
179
+ }
180
+ },
181
+ "nbformat": 4,
182
+ "nbformat_minor": 2
183
+ }
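A quick sanity-check sketch for the ingested data: reload one of the datasets that DataIngestion.download_data() wrote with save_to_disk and inspect its splits (the paths follow LOCAL_DATA_FILE in config.yaml).

    from datasets import load_from_disk

    dataset = load_from_disk("artifacts/data_ingestion/multi_news")
    print(dataset)                                   # DatasetDict with train/validation/test splits
    print(dataset["train"][0]["document"][:200])     # multi_news stores articles in the "document" column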
research/data_transformation.ipynb ADDED
@@ -0,0 +1,234 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "\n",
11
+ "os.chdir(\"../\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "from dataclasses import dataclass\n",
21
+ "from pathlib import Path\n",
22
+ "\n",
23
+ "@dataclass(frozen=True)\n",
24
+ "class DataTransformationConfig:\n",
25
+ " root_dir: Path\n",
26
+ " data_path: Path\n",
27
+ " ALL_REQUIRED_DATA: Path\n",
28
+ " tokenizer_name: Path"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from summarylm.constants import *\n",
38
+ "from summarylm.utils.common import read_yaml, create_directories"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "class ConfigurationManager:\n",
48
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
49
+ " self.config = read_yaml(config_filepath)\n",
50
+ " self.params = read_yaml(params_filepath)\n",
51
+ "\n",
52
+ " create_directories([self.config.artifacts_root])\n",
53
+ "\n",
54
+ " def get_data_transformation_config(self) -> DataTransformationConfig:\n",
55
+ " config = self.config.data_transformation\n",
56
+ "\n",
57
+ " create_directories([config.root_dir])\n",
58
+ "\n",
59
+ " data_transformation_config = DataTransformationConfig(\n",
60
+ " root_dir=config.root_dir,\n",
61
+ " data_path=config.data_path,\n",
62
+ " ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
63
+ " tokenizer_name=config.tokenizer_name\n",
64
+ " )\n",
65
+ "\n",
66
+ " return data_transformation_config"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 7,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "import os\n",
76
+ "import sys\n",
77
+ "from summarylm.logging import logger\n",
78
+ "from summarylm.exception import CustomException\n",
79
+ "from transformers import AutoTokenizer\n",
80
+ "from datasets import load_dataset, load_from_disk\n",
81
+ "from datasets import concatenate_datasets, DatasetDict"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 24,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "class DataTransformation:\n",
91
+ " def __init__(self, config: DataTransformationConfig):\n",
92
+ " self.config = config\n",
93
+ " self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
94
+ "\n",
95
+ " def convert_data_into_right_format(self, datasets: list) -> DatasetDict:\n",
96
+ " # loading all datasets\n",
97
+ " loaded_datasets = {}\n",
98
+ " print(\"Loading the dataset\")\n",
99
+ " for data in datasets:\n",
100
+ " loaded_datasets[data] = load_from_disk(data)\n",
101
+ "\n",
102
+ " dataset1 = loaded_datasets[datasets[0]]\n",
103
+ " dataset2 = loaded_datasets[datasets[1]]\n",
104
+ " print(\"Dataset loaded\")\n",
105
+ "\n",
106
+ " # removing unwanted columns from dataset1\n",
107
+ " dataset1_train = dataset1['train'].select_columns(['article', 'summary'])\n",
108
+ " dataset1_test = dataset1['test'].select_columns(['article', 'summary'])\n",
109
+ " dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])\n",
110
+ "\n",
111
+ " # renaming data column name of dataset1\n",
112
+ " dataset1_train = dataset1_train.rename_column('article', 'text')\n",
113
+ " dataset1_test = dataset1_test.rename_column('article', 'text')\n",
114
+ " dataset1_validation = dataset1_validation.rename_column('article', 'text')\n",
115
+ "\n",
116
+ " # renaming data column name of dataset2\n",
117
+ " dataset2_train = dataset2['train'].rename_column('document', 'text')\n",
118
+ " dataset2_test = dataset2['test'].rename_column('document', 'text')\n",
119
+ " dataset2_validation = dataset2['validation'].rename_column('document', 'text')\n",
120
+ "\n",
121
+ " # concatenate_datasets\n",
122
+ " dataset_train = concatenate_datasets([dataset1_train, dataset2_train])\n",
123
+ " dataset_test = concatenate_datasets([dataset1_test, dataset2_test])\n",
124
+ " dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])\n",
125
+ "\n",
126
+ " # loading teh dataset into DatasetDict\n",
127
+ " dataset = DatasetDict({\n",
128
+ " \"train\": dataset_train,\n",
129
+ " \"validation\": dataset_validation,\n",
130
+ " \"test\": dataset_test,\n",
131
+ " })\n",
132
+ "\n",
133
+ " return dataset\n",
134
+ "\n",
135
+ " def convert_examples_to_features(self, example_batch):\n",
136
+ " input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)\n",
137
+ " \n",
138
+ " with self.tokenizer.as_target_tokenizer():\n",
139
+ " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)\n",
140
+ " \n",
141
+ " return {\n",
142
+ " 'input_ids': input_encodings['input_ids'],\n",
143
+ " 'attention_mask': input_encodings['attention_mask'],\n",
144
+ " 'labels': target_encodings['input_ids']\n",
145
+ " }\n",
146
+ " \n",
147
+ " def convert(self):\n",
148
+ " data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])\n",
149
+ " data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])\n",
150
+ "\n",
151
+ " dataset = self.convert_data_into_right_format([data1, data2])\n",
152
+ " dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)\n",
153
+ " dataset_pt.save_to_disk(os.path.join(self.config.root_dir, \"dataset\"))"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 25,
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "name": "stdout",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "[2024-05-23 09:04:24,048: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
166
+ "[2024-05-23 09:04:24,051: INFO: common: Yaml file: params.yaml loaded successfully]\n",
167
+ "[2024-05-23 09:04:24,052: INFO: common: Directory created successfully at: artifacts]\n",
168
+ "[2024-05-23 09:04:24,053: INFO: common: Directory created successfully at: artifacts/data_transformation]\n",
169
+ "Loading the dataset\n",
170
+ "Dataset loaded\n"
171
+ ]
172
+ },
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "Map: 0%| | 0/80747 [00:00<?, ? examples/s]d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:3921: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
178
+ " warnings.warn(\n",
179
+ "Map: 100%|██████████| 80747/80747 [11:43<00:00, 114.72 examples/s]\n",
180
+ "Map: 100%|██████████| 7622/7622 [01:20<00:00, 94.22 examples/s] \n",
181
+ "Map: 100%|██████████| 7622/7622 [01:59<00:00, 63.80 examples/s] \n",
182
+ "Saving the dataset (3/3 shards): 100%|██████████| 80747/80747 [00:13<00:00, 5803.62 examples/s] \n",
183
+ "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 4202.00 examples/s]\n",
184
+ "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 6924.25 examples/s]\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "try:\n",
190
+ " config = ConfigurationManager()\n",
191
+ " data_transformation_config = config.get_data_transformation_config()\n",
192
+ " data_transformation = DataTransformation(config=data_transformation_config)\n",
193
+ " data_transformation.convert()\n",
194
+ "except Exception as e:\n",
195
+ " raise e"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": []
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": []
211
+ }
212
+ ],
213
+ "metadata": {
214
+ "kernelspec": {
215
+ "display_name": "env",
216
+ "language": "python",
217
+ "name": "python3"
218
+ },
219
+ "language_info": {
220
+ "codemirror_mode": {
221
+ "name": "ipython",
222
+ "version": 3
223
+ },
224
+ "file_extension": ".py",
225
+ "mimetype": "text/x-python",
226
+ "name": "python",
227
+ "nbconvert_exporter": "python",
228
+ "pygments_lexer": "ipython3",
229
+ "version": "3.12.2"
230
+ }
231
+ },
232
+ "nbformat": 4,
233
+ "nbformat_minor": 2
234
+ }
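The Map step above logs a deprecation warning for as_target_tokenizer(); in newer transformers releases the same label tokenization can be expressed with the text_target argument. A sketch of an equivalent convert_examples_to_features (same output fields, no context manager):

    def convert_examples_to_features(self, example_batch):
        input_encodings = self.tokenizer(example_batch["text"], max_length=1024, truncation=True)
        # tokenize the summaries as targets directly, replacing the deprecated as_target_tokenizer() block
        target_encodings = self.tokenizer(text_target=example_batch["summary"], max_length=128, truncation=True)

        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"],
        }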
research/data_validation.ipynb ADDED
@@ -0,0 +1,197 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir(\"../\")"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 5,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class DataValidationConfig:\n",
52
+ " root_dir: Path\n",
53
+ " STATUS_FILE: str\n",
54
+ " ALL_REQUIRED_DATA: list\n",
55
+ " ALL_REQUIRED_FILES: list"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 6,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from summarylm.constants import *\n",
65
+ "from summarylm.utils.common import read_yaml, create_directories"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 7,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "class ConfigurationManager:\n",
75
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
76
+ " self.config = read_yaml(config_filepath)\n",
77
+ " self.params = read_yaml(params_filepath)\n",
78
+ "\n",
79
+ " create_directories([self.config.artifacts_root])\n",
80
+ "\n",
81
+ " def get_data_validation_config(self) -> DataValidationConfig:\n",
82
+ " config = self.config.data_validation\n",
83
+ "\n",
84
+ " create_directories([config.root_dir])\n",
85
+ "\n",
86
+ " data_validation_config = DataValidationConfig(\n",
87
+ " root_dir=config.root_dir,\n",
88
+ " STATUS_FILE=config.STATUS_FILE,\n",
89
+ " ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
90
+ " ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
91
+ " )\n",
92
+ "\n",
93
+ " return data_validation_config"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 8,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "import os\n",
103
+ "import sys\n",
104
+ "from summarylm.logging import logger\n",
105
+ "from summarylm.exception import CustomException"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 9,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "class DataValidation:\n",
115
+ " def __init__(self, config: DataValidationConfig):\n",
116
+ " self.config = config\n",
117
+ "\n",
118
+ " def validate_all_files_exist(self) -> bool:\n",
119
+ " try:\n",
120
+ " validation_status = None\n",
121
+ "\n",
122
+ " for data in self.config.ALL_REQUIRED_DATA:\n",
123
+ " all_files = os.listdir(os.path.join(\"artifacts\", \"data_ingestion\", data))\n",
124
+ "\n",
125
+ " for file in all_files:\n",
126
+ " if file not in self.config.ALL_REQUIRED_FILES:\n",
127
+ " validation_status = False\n",
128
+ "\n",
129
+ " with open(self.config.STATUS_FILE, 'w') as f:\n",
130
+ " f.write(f\"Validation status: {validation_status}\")\n",
131
+ " else:\n",
132
+ " validation_status = True\n",
133
+ "\n",
134
+ " with open(self.config.STATUS_FILE, 'w') as f:\n",
135
+ " f.write(f\"Validation status: {validation_status}\")\n",
136
+ "\n",
137
+ " return validation_status\n",
138
+ " except Exception as e:\n",
139
+ " raise CustomException(e, sys) from e"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 10,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "[2024-05-23 08:11:43,852: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
152
+ "[2024-05-23 08:11:43,856: INFO: common: Yaml file: params.yaml loaded successfully]\n",
153
+ "[2024-05-23 08:11:43,857: INFO: common: Directory created successfully at: artifacts]\n",
154
+ "[2024-05-23 08:11:43,858: INFO: common: Directory created successfully at: artifacts/data_validation]\n"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "try:\n",
160
+ " config = ConfigurationManager()\n",
161
+ " data_validation_config = config.get_data_validation_config()\n",
162
+ " data_validation = DataValidation(config=data_validation_config)\n",
163
+ " data_validation.validate_all_files_exist()\n",
164
+ "except Exception as e:\n",
165
+ " raise CustomException(e, sys) from e"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": []
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "env",
179
+ "language": "python",
180
+ "name": "python3"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 3
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython3",
192
+ "version": "3.12.2"
193
+ }
194
+ },
195
+ "nbformat": 4,
196
+ "nbformat_minor": 2
197
+ }
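One caveat in validate_all_files_exist() above: validation_status is rewritten on every file, so only the last entry decides the final status, and a required split that is simply absent never flips it to False. A stricter sketch using the same config fields would be:

    import os

    def validate_all_files_exist(self) -> bool:
        validation_status = True
        for data in self.config.ALL_REQUIRED_DATA:
            present = set(os.listdir(os.path.join("artifacts", "data_ingestion", data)))
            missing = [f for f in self.config.ALL_REQUIRED_FILES if f not in present]
            if missing:
                validation_status = False

        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status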
research/experiment.ipynb ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from ensure import ensure_annotations"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 5,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "@ensure_annotations\n",
19
+ "def multiply(a: int, b: int) -> int:\n",
20
+ " return a * b"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 6,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "ename": "EnsureError",
30
+ "evalue": "Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>",
31
+ "output_type": "error",
32
+ "traceback": [
33
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
34
+ "\u001b[1;31mEnsureError\u001b[0m Traceback (most recent call last)",
35
+ "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mmultiply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
36
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\ensure\\main.py:870\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value, templ):\n\u001b[0;32m 869\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mArgument \u001b[39m\u001b[38;5;132;01m{arg}\u001b[39;00m\u001b[38;5;124m of type \u001b[39m\u001b[38;5;132;01m{valt}\u001b[39;00m\u001b[38;5;124m to \u001b[39m\u001b[38;5;132;01m{f}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match annotation type \u001b[39m\u001b[38;5;132;01m{t}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 870\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EnsureError(msg\u001b[38;5;241m.\u001b[39mformat(arg\u001b[38;5;241m=\u001b[39marg, f\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf, t\u001b[38;5;241m=\u001b[39mtempl, valt\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mtype\u001b[39m(value)))\n\u001b[0;32m 872\u001b[0m return_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 873\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_val, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_templ):\n",
37
+ "\u001b[1;31mEnsureError\u001b[0m: Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>"
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "multiply(2, \"3\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": []
51
+ }
52
+ ],
53
+ "metadata": {
54
+ "kernelspec": {
55
+ "display_name": "env",
56
+ "language": "python",
57
+ "name": "python3"
58
+ },
59
+ "language_info": {
60
+ "codemirror_mode": {
61
+ "name": "ipython",
62
+ "version": 3
63
+ },
64
+ "file_extension": ".py",
65
+ "mimetype": "text/x-python",
66
+ "name": "python",
67
+ "nbconvert_exporter": "python",
68
+ "pygments_lexer": "ipython3",
69
+ "version": "3.12.2"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 2
74
+ }
research/model_evaluation.ipynb ADDED
@@ -0,0 +1,265 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir(\"../\")"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class ModelEvaluationConfig:\n",
52
+ " root_dir: Path\n",
53
+ " data_path: Path\n",
54
+ " model_path: Path\n",
55
+ " tokenizer_path: Path\n",
56
+ " metric_file_name: Path"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "from summarylm.constants import *\n",
66
+ "from summarylm.utils.common import read_yaml, create_directories"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 7,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "class ConfigurationManager:\n",
76
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
77
+ " self.config = read_yaml(config_filepath)\n",
78
+ " self.params = read_yaml(params_filepath)\n",
79
+ "\n",
80
+ " create_directories([self.config.artifacts_root])\n",
81
+ "\n",
82
+ " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
83
+ " config = self.config.model_evaluation\n",
84
+ "\n",
85
+ " create_directories([config.root_dir])\n",
86
+ "\n",
87
+ " model_evaluation_config = ModelEvaluationConfig(\n",
88
+ " root_dir=config.root_dir,\n",
89
+ " data_path=config.data_path,\n",
90
+ " model_path=config.model_path,\n",
91
+ " tokenizer_path=config.tokenizer_path,\n",
92
+ " metric_file_name=config.metric_file_name,\n",
93
+ " )\n",
94
+ "\n",
95
+ " return model_evaluation_config"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 9,
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "[2024-05-21 08:29:30,191: INFO: config: PyTorch version 2.3.0 available.]\n"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
113
+ "from datasets import load_dataset, load_from_disk, load_metric\n",
114
+ "import torch\n",
115
+ "import pandas as pd\n",
116
+ "from tqdm import tqdm"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 10,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "class ModelEvaluation:\n",
126
+ " def __init__(self, config: ModelEvaluationConfig):\n",
127
+ " self.config = config\n",
128
+ " \n",
129
+ " def generate_batch_size_chunks(self, list_of_elements, batch_size):\n",
130
+ " \"\"\"\n",
131
+ " Split the dataset into smaller batches that we can process simultaneously\n",
132
+ " Yield successive batch-sized chunks from list_of_elements.\n",
133
+ " \"\"\"\n",
134
+ " for i in range(0, len(list_of_elements), batch_size):\n",
135
+ " yield list_of_elements[i : i + batch_size]\n",
136
+ " \n",
137
+ " def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,\n",
138
+ " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
139
+ " column_text=\"article\",\n",
140
+ " column_summary=\"highlights\"):\n",
141
+ " article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))\n",
142
+ " target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))\n",
143
+ " \n",
144
+ " for article_batch, target_batch in tqdm(\n",
145
+ " zip(article_batches, target_batches), total=len(article_batches)):\n",
146
+ " \n",
147
+ " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n",
148
+ " padding=\"max_length\", return_tensors=\"pt\")\n",
149
+ " \n",
150
+ " summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
151
+ " attention_mask=inputs[\"attention_mask\"].to(device), \n",
152
+ " length_penalty=0.8, num_beams=8, max_length=128)\n",
153
+ " \n",
154
+ " ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n",
155
+ " \n",
156
+ " # Finally, we decode the generated texts, \n",
157
+ " # replace the token, and add the decoded texts with the references to the metric.\n",
158
+ " decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
159
+ " clean_up_tokenization_spaces=True) for s in summaries] \n",
160
+ " \n",
161
+ " decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
162
+ " \n",
163
+ " \n",
164
+ " metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
165
+ " \n",
166
+ " # Finally compute and return the ROUGE scores.\n",
167
+ " score = metric.compute()\n",
168
+ " return score\n",
169
+ " \n",
170
+ " def evaluation(self):\n",
171
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
172
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
173
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
174
+ "\n",
175
+ " # loading data\n",
176
+ " dataset_pt = load_from_disk(self.config.data_path)\n",
177
+ "\n",
178
+ " rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
179
+ " \n",
180
+ " rouge_metric = load_metric('rouge')\n",
181
+ "\n",
182
+ " score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')\n",
183
+ " rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n",
184
+ " df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n",
185
+ " df.to_csv(self.config.metric_file_name, index=False)"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 12,
191
+ "metadata": {},
192
+ "outputs": [
193
+ {
194
+ "name": "stdout",
195
+ "output_type": "stream",
196
+ "text": [
197
+ "[2024-05-21 08:43:47,280: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
198
+ "[2024-05-21 08:43:47,284: INFO: common: Yaml file: params.yaml loaded successfully]\n",
199
+ "[2024-05-21 08:43:47,285: INFO: common: Directory created successfully at: artifacts]\n",
200
+ "[2024-05-21 08:43:47,286: INFO: common: Directory created successfully at: artifacts/model_evaluation]\n"
201
+ ]
202
+ },
203
+ {
204
+ "ename": "OSError",
205
+ "evalue": "Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub.",
206
+ "output_type": "error",
207
+ "traceback": [
208
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
209
+ "\u001b[1;31mHFValidationError\u001b[0m Traceback (most recent call last)",
210
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:398\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 396\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 397\u001b[0m \u001b[38;5;66;03m# Load from URL or cache if already cached\u001b[39;00m\n\u001b[1;32m--> 398\u001b[0m resolved_file \u001b[38;5;241m=\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 402\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 403\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 404\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 405\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 406\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GatedRepoError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
211
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:106\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrepo_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m--> 106\u001b[0m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arg_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m arg_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
212
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:154\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[1;34m(repo_id)\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m repo_id\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 154\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[0;32m 155\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepo id must be in the form \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrepo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnamespace/repo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Use `repo_type` argument if needed.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 157\u001b[0m )\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX\u001b[38;5;241m.\u001b[39mmatch(repo_id):\n",
213
+ "\u001b[1;31mHFValidationError\u001b[0m: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'artifacts/model_trainer/tokenizer'. Use `repo_type` argument if needed.",
214
+ "\nThe above exception was the direct cause of the following exception:\n",
215
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
216
+ "Cell \u001b[1;32mIn[12], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m model_evaluation_config\u001b[38;5;241m.\u001b[39mevaluation()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
217
+ "Cell \u001b[1;32mIn[12], line 5\u001b[0m\n\u001b[0;32m 3\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_model_evaluation_config()\n\u001b[0;32m 4\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m ModelEvaluation(config\u001b[38;5;241m=\u001b[39mmodel_evaluation_config)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mmodel_evaluation_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
218
+ "Cell \u001b[1;32mIn[10], line 48\u001b[0m, in \u001b[0;36mModelEvaluation.evaluation\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mevaluation\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 47\u001b[0m device \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mis_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m---> 48\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 49\u001b[0m model_pegasus \u001b[38;5;241m=\u001b[39m AutoModelForSeq2SeqLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mmodel_path)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m# loading data\u001b[39;00m\n",
219
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:804\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[1;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[0;32m 801\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 803\u001b[0m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n\u001b[1;32m--> 804\u001b[0m tokenizer_config \u001b[38;5;241m=\u001b[39m \u001b[43mget_tokenizer_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m tokenizer_config:\n\u001b[0;32m 806\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m tokenizer_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
220
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:637\u001b[0m, in \u001b[0;36mget_tokenizer_config\u001b[1;34m(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)\u001b[0m\n\u001b[0;32m 634\u001b[0m token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[0;32m 636\u001b[0m commit_hash \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m--> 637\u001b[0m resolved_config_file \u001b[38;5;241m=\u001b[39m \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[43m \u001b[49m\u001b[43mTOKENIZER_CONFIG_FILE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 640\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 641\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 642\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 643\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 644\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 645\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_gated_repo\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_missing_entries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_connection_errors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 652\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 654\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not locate the tokenizer configuration file, will try to use the model config 
instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
221
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:462\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 460\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a specific connection error when trying to load \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HFValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[0;32m 463\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncorrect path_or_model_id: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Please provide either the path to a local folder or the repo_id of a model on the Hub.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 464\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resolved_file\n",
222
+ "\u001b[1;31mOSError\u001b[0m: Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
223
+ ]
224
+ }
225
+ ],
226
+ "source": [
227
+ "try:\n",
228
+ " config = ConfigurationManager()\n",
229
+ " model_evaluation_config = config.get_model_evaluation_config()\n",
230
+ " model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
231
+ " model_evaluation_config.evaluation()\n",
232
+ "except Exception as e:\n",
233
+ " raise e"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": []
242
+ }
243
+ ],
244
+ "metadata": {
245
+ "kernelspec": {
246
+ "display_name": "env",
247
+ "language": "python",
248
+ "name": "python3"
249
+ },
250
+ "language_info": {
251
+ "codemirror_mode": {
252
+ "name": "ipython",
253
+ "version": 3
254
+ },
255
+ "file_extension": ".py",
256
+ "mimetype": "text/x-python",
257
+ "name": "python",
258
+ "nbconvert_exporter": "python",
259
+ "pygments_lexer": "ipython3",
260
+ "version": "3.12.2"
261
+ }
262
+ },
263
+ "nbformat": 4,
264
+ "nbformat_minor": 2
265
+ }
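Note: the OSError above is the evaluation stage running before the trainer stage has produced its artifacts: 'artifacts/model_trainer/tokenizer' does not exist yet, so transformers falls back to treating the path as a Hub repo id and rejects it. A minimal pre-flight check, assuming the artifact paths referenced in the traceback and in config.yaml (they may differ in your setup), could look like this:

from pathlib import Path

# Hypothetical sanity check before calling ModelEvaluation.evaluation();
# the paths are assumptions based on the trainer's save locations.
required = [
    "artifacts/model_trainer/pegasus-summary-lm",   # saved model directory
    "artifacts/model_trainer/tokenizer",            # saved tokenizer directory
    "artifacts/data_transformation/dataset",        # tokenized dataset
]
missing = [p for p in required if not Path(p).exists()]
if missing:
    raise FileNotFoundError(f"Run the earlier pipeline stages first; missing: {missing}")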
research/model_trainer.ipynb ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir('../')"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 5,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class ModelTrainerConfig:\n",
52
+ " root_dir: Path\n",
53
+ " data_path: Path\n",
54
+ " model_ckpt: Path\n",
55
+ " num_train_epochs: int\n",
56
+ " warmup_steps: int\n",
57
+ " per_device_train_batch_size: int\n",
58
+ " weight_decay: float\n",
59
+ " logging_steps: int\n",
60
+ " evaluation_strategy: str\n",
61
+ " eval_steps: int\n",
62
+ " save_steps: float\n",
63
+ " gradient_accumulation_steps: int"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 6,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "from summarylm.constants import *\n",
73
+ "from summarylm.utils.common import read_yaml, create_directories"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 9,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "class ConfigurationManager:\n",
83
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
84
+ " self.config = read_yaml(config_filepath)\n",
85
+ " self.params = read_yaml(params_filepath)\n",
86
+ "\n",
87
+ " create_directories([self.config.artifacts_root])\n",
88
+ "\n",
89
+ " def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
90
+ " config = self.config.model_trainer\n",
91
+ " params = self.params.TrainingArguments\n",
92
+ "\n",
93
+ " create_directories([config.root_dir])\n",
94
+ "\n",
95
+ " model_trainer_config = ModelTrainerConfig(\n",
96
+ " root_dir=config.root_dir,\n",
97
+ " data_path=config.data_path,\n",
98
+ " model_ckpt=config.model_ckpt,\n",
99
+ " num_train_epochs=params.num_train_epochs,\n",
100
+ " warmup_steps=params.warmup_steps,\n",
101
+ " per_device_train_batch_size=params.per_device_train_batch_size,\n",
102
+ " weight_decay=params.weight_decay,\n",
103
+ " logging_steps=params.logging_steps,\n",
104
+ " evaluation_strategy=params.evaluation_strategy,\n",
105
+ " eval_steps=params.eval_steps,\n",
106
+ " save_steps=params.save_steps,\n",
107
+ " gradient_accumulation_steps=params.gradient_accumulation_steps,\n",
108
+ " )\n",
109
+ "\n",
110
+ " return model_trainer_config"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 11,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "from transformers import TrainingArguments, Trainer\n",
120
+ "from transformers import DataCollatorForSeq2Seq\n",
121
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
122
+ "from datasets import load_dataset, load_from_disk\n",
123
+ "import torch"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 12,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "class ModelTrainer:\n",
133
+ " def __init__(self, config: ModelTrainerConfig):\n",
134
+ " self.config = config\n",
135
+ "\n",
136
+ " def train(self):\n",
137
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
138
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
139
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
140
+ " seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
141
+ "\n",
142
+ " # loading the dataset\n",
143
+ " dataset_pt = load_from_disk(self.config.data_path)\n",
144
+ "\n",
145
+ " trainer_args = TrainingArguments(\n",
146
+ " output_dir=self.config.root_dir,\n",
147
+ " num_train_epochs=self.config.num_train_epochs,\n",
148
+ " warmup_steps=self.config.warmup_steps,\n",
149
+ " per_device_train_batch_size=self.config.per_device_train_batch_size,\n",
150
+ " per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
151
+ " weight_decay=self.config.weight_decay,\n",
152
+ " logging_steps=self.config.logging_steps,\n",
153
+ " evaluation_strategy=self.config.evaluation_strategy,\n",
154
+ " eval_steps=self.config.eval_steps,\n",
155
+ " save_steps=self.config.save_steps,\n",
156
+ " gradient_accumulation_steps=self.config.gradient_accumulation_steps,\n",
157
+ " )\n",
158
+ "\n",
159
+ " trainer = Trainer(\n",
160
+ " model=model_pegasus,\n",
161
+ " args=trainer_args,\n",
162
+ " tokenizer=tokenizer, \n",
163
+ " data_collator=seq2seq_data_collator,\n",
164
+ " train_dataset=dataset_pt['train'],\n",
165
+ " eval_dataset=dataset_pt['validation']\n",
166
+ " )\n",
167
+ "\n",
168
+ " trainer.train()\n",
169
+ "\n",
170
+ " ## Save model\n",
171
+ " model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-summary-lm\"))\n",
172
+ " ## Save tokenizer\n",
173
+ " tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 13,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "[2024-05-21 07:37:38,704: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
186
+ "[2024-05-21 07:37:38,721: INFO: common: Yaml file: params.yaml loaded successfully]\n",
187
+ "[2024-05-21 07:37:38,739: INFO: common: Directory created successfully at: artifacts]\n",
188
+ "[2024-05-21 07:37:38,742: INFO: common: Directory created successfully at: artifacts/model_trainer]\n"
189
+ ]
190
+ },
191
+ {
192
+ "name": "stderr",
193
+ "output_type": "stream",
194
+ "text": [
195
+ "d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
196
+ " warnings.warn(\n"
197
+ ]
198
+ }
199
+ ],
200
+ "source": [
201
+ "try:\n",
202
+ " config = ConfigurationManager()\n",
203
+ " model_trainer_config = config.get_model_trainer_config()\n",
204
+ " model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
205
+ " model_trainer_config.train()\n",
206
+ "except Exception as e:\n",
207
+ " raise e"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": []
216
+ }
217
+ ],
218
+ "metadata": {
219
+ "kernelspec": {
220
+ "display_name": "env",
221
+ "language": "python",
222
+ "name": "python3"
223
+ },
224
+ "language_info": {
225
+ "codemirror_mode": {
226
+ "name": "ipython",
227
+ "version": 3
228
+ },
229
+ "file_extension": ".py",
230
+ "mimetype": "text/x-python",
231
+ "name": "python",
232
+ "nbconvert_exporter": "python",
233
+ "pygments_lexer": "ipython3",
234
+ "version": "3.12.2"
235
+ }
236
+ },
237
+ "nbformat": 4,
238
+ "nbformat_minor": 2
239
+ }
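Note: fine-tuning Pegasus on the full concatenated dataset is a long GPU job, so it can help to smoke-test the notebook's Trainer wiring on a tiny slice first. A minimal sketch, assuming the tokenized DatasetDict saved by the data-transformation stage (the path below is an assumption; use whatever data_path points to in config.yaml):

from datasets import load_from_disk

# Hypothetical smoke test: shrink both splits so trainer.train() finishes quickly.
dataset_pt = load_from_disk("artifacts/data_transformation/dataset")
tiny_train = dataset_pt["train"].select(range(8))
tiny_eval = dataset_pt["validation"].select(range(8))

# Reuse the Trainer built in the cell above, only swapping the datasets:
# trainer = Trainer(model=model_pegasus, args=trainer_args, tokenizer=tokenizer,
#                   data_collator=seq2seq_data_collator,
#                   train_dataset=tiny_train, eval_dataset=tiny_eval)
# trainer.train()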
setup.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import setuptools
2
+
3
+ with open("README.md", "r", encoding="utf-8") as f:
4
+ long_description = f.read()
5
+
6
+ __version__ = "0.0.0"
7
+
8
+ AUTHOR_USER_NAME = "Satyam Mishra"
9
+ SRC_REPO = "SummaryLM"
10
+ AUTHOR_EMAIL = "[email protected]"
11
+
12
+ setuptools.setup(
13
+ name=SRC_REPO,
14
+ version=__version__,
15
+ author=AUTHOR_USER_NAME,
16
+ author_email=AUTHOR_EMAIL,
17
+ description='A text summarizer',
18
+ long_description=long_description,
19
+ long_description_content_type="text/markdown",
20
+ package_dir={"": "src"},
21
+ packages=setuptools.find_packages(where="src"),
22
+ )
src/summarylm/__init__.py ADDED
File without changes
src/summarylm/components/__init__.py ADDED
File without changes
src/summarylm/components/data_ingestion.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import zipfile
4
+ from pathlib import Path
5
+ from datasets import load_dataset
6
+
7
+ from summarylm.entity import DataIngestionConfig
8
+ from summarylm.utils.common import get_size
9
+ from summarylm.logging import logger
10
+ from summarylm.exception import CustomException
11
+
12
+ class DataIngestion:
13
+ """
14
+ Class to download data and store it in the artifacts folder
15
+
16
+ Args:
17
+ config (DataIngestionConfig): Contains all configuration for data ingestion
18
+
19
+ Returns:
20
+ None
21
+ """
22
+ def __init__(self, config: DataIngestionConfig):
23
+ self.config = config
24
+
25
+ def download_data(self):
26
+ """
27
+ Function to download datasets from the Hugging Face Hub and save them to disk
28
+ """
29
+ try:
30
+ for i in range(len(self.config.LOCAL_DATA_FILE)):
31
+ if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):
32
+ dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])
33
+ dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])
34
+ logger.info(f"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!")
35
+ else:
36
+ logger.info(f"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}")
37
+
38
+ except Exception as e:
39
+ raise CustomException(e, sys) from e
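Note: download_data pairs the entries of ALL_HUGGINGFACE_DATA and LOCAL_DATA_FILE by index, so the two lists must have the same length and order. A minimal sketch of a matching DataIngestionConfig (the dataset ids and paths are hypothetical; the real values live in config/config.yaml):

from summarylm.entity import DataIngestionConfig

config = DataIngestionConfig(
    root_dir="artifacts/data_ingestion",
    ALL_HUGGINGFACE_DATA=["xsum", "some-namespace/news-summaries"],  # assumed dataset ids
    LOCAL_DATA_FILE=[
        "artifacts/data_ingestion/xsum",
        "artifacts/data_ingestion/news-summaries",
    ],
)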
src/summarylm/components/data_transformation.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+ from summarylm.exception import CustomException
5
+ from summarylm.entity import DataTransformationConfig
6
+ from transformers import AutoTokenizer
7
+ from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict
8
+
9
+
10
+ class DataTransformation:
11
+ """
12
+ Class for transforming data into a valid format for training
13
+
14
+ Args:
15
+ config (DataTransformationConfig): Contains all configuration for data transformation
16
+ """
17
+ def __init__(self, config: DataTransformationConfig):
18
+ self.config = config
19
+ self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
20
+
21
+ def convert_data_into_right_format(self, datasets: list) -> DatasetDict:
22
+ """
23
+ Function to remove & rename columns and convert the data into the right format for training
24
+
25
+ Args:
26
+ datasets (list): list of all dataset paths
27
+
28
+ Returns:
29
+ DatasetDict: Contains train, test, and validation sets
30
+ """
31
+ try:
32
+ logger.info("Entered convert_data_into_right_format method of DataTransformation class.")
33
+ # loading all datasets
34
+ loaded_datasets = {}
35
+ print("Loading the dataset")
36
+ for data in datasets:
37
+ loaded_datasets[data] = load_from_disk(data)
38
+
39
+ dataset1 = loaded_datasets[datasets[0]]
40
+ dataset2 = loaded_datasets[datasets[1]]
41
+ print("Dataset loaded")
42
+
43
+ # removing unwanted columns from dataset1
44
+ dataset1_train = dataset1['train'].select_columns(['article', 'summary'])
45
+ dataset1_test = dataset1['test'].select_columns(['article', 'summary'])
46
+ dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])
47
+
48
+ # renaming data column name of dataset1
49
+ dataset1_train = dataset1_train.rename_column('article', 'text')
50
+ dataset1_test = dataset1_test.rename_column('article', 'text')
51
+ dataset1_validation = dataset1_validation.rename_column('article', 'text')
52
+
53
+ # renaming data column name of dataset2
54
+ dataset2_train = dataset2['train'].rename_column('document', 'text')
55
+ dataset2_test = dataset2['test'].rename_column('document', 'text')
56
+ dataset2_validation = dataset2['validation'].rename_column('document', 'text')
57
+
58
+ # concatenate_datasets
59
+ dataset_train = concatenate_datasets([dataset1_train, dataset2_train])
60
+ dataset_test = concatenate_datasets([dataset1_test, dataset2_test])
61
+ dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])
62
+
63
+ # loading the dataset into DatasetDict
64
+ dataset = DatasetDict({
65
+ "train": dataset_train,
66
+ "validation": dataset_validation,
67
+ "test": dataset_test,
68
+ })
69
+ return dataset
70
+
71
+ except Exception as e:
72
+ raise CustomException(e, sys) from e
73
+
74
+ def convert_examples_to_features(self, example_batch):
75
+ """
76
+ Method to convert text and summary pairs into tokenized features
77
+
78
+ Args:
79
+ example_batch: dataset after loading it from datasets library
80
+ Returns:
81
+ input_ids: A list of token ids representing the dialogue
82
+ attention_mask: List of indices specifying which tokens should be attended to by the model
83
+ labels: A list of token ids representing the summary
84
+ """
85
+ try:
86
+ logger.info("Entered convert_examples_to_features method of DataTransformation class.")
87
+ input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)
88
+
89
+ with self.tokenizer.as_target_tokenizer():
90
+ target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)
91
+
92
+ return {
93
+ 'input_ids': input_encodings['input_ids'],
94
+ 'attention_mask': input_encodings['attention_mask'],
95
+ 'labels': target_encodings['input_ids']
96
+ }
97
+ except Exception as e:
98
+ raise CustomException(e, sys) from e
99
+
100
+
101
+ def convert(self):
102
+ data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])
103
+ data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])
104
+
105
+ dataset = self.convert_data_into_right_format([data1, data2])
106
+ dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)
107
+ dataset_pt.save_to_disk(os.path.join(self.config.root_dir, "dataset"))
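Note: convert_examples_to_features runs under datasets' batched map, so example_batch is a dict of lists and the tokenizer is applied to whole batches of texts and summaries at once. A minimal sketch of what one batched call produces, using a hypothetical Pegasus checkpoint in place of config.tokenizer_name:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")  # assumed checkpoint

example_batch = {
    "text": ["A long source article ...", "Another article ..."],
    "summary": ["Short summary.", "Another short summary."],
}
input_encodings = tokenizer(example_batch["text"], max_length=1024, truncation=True)
with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(example_batch["summary"], max_length=128, truncation=True)

# These are the three columns the map step adds to the dataset:
features = {
    "input_ids": input_encodings["input_ids"],
    "attention_mask": input_encodings["attention_mask"],
    "labels": target_encodings["input_ids"],
}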
src/summarylm/components/data_validation.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+ from summarylm.exception import CustomException
5
+ from summarylm.entity import DataValidationConfig
6
+
7
+
8
+ class DataValidation:
9
+ """
10
+ Class for validating that all data files exist in the train, test, and validation folders
11
+
12
+ Args:
13
+ config (DataValidationConfig): Contains all configuration for data validation
14
+
15
+ Returns:
16
+ validation_status (bool): True if all required data exists, else False
17
+ """
18
+ def __init__(self, config: DataValidationConfig):
19
+ self.config = config
20
+
21
+ def validate_all_files_exist(self) -> bool:
22
+ try:
23
+ logger.info("Entered validate_all_files_exist method of DataValidation class.")
24
+ validation_status = None
25
+
26
+ for data in self.config.ALL_REQUIRED_DATA:
27
+ all_files = os.listdir(os.path.join("artifacts", "data_ingestion", data))
28
+
29
+ for file in all_files:
30
+ if file not in self.config.ALL_REQUIRED_FILES:
31
+ validation_status = False
32
+
33
+ with open(self.config.STATUS_FILE, 'w') as f:
34
+ f.write(f"Validation status: {validation_status}")
35
+ else:
36
+ validation_status = True
37
+
38
+ with open(self.config.STATUS_FILE, 'w') as f:
39
+ f.write(f"Validation status: {validation_status}")
40
+
41
+ logger.info("Completed validate_all_files_exist method of DataValidation class.")
42
+
43
+ return validation_status
44
+ except Exception as e:
45
+ raise CustomException(e, sys) from e
src/summarylm/components/model_evaluation.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.entity import ModelEvaluationConfig
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ from datasets import load_dataset, load_from_disk, load_metric
4
+ import torch
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+
8
+
9
+ class ModelEvaluation:
10
+ def __init__(self, config: ModelEvaluationConfig):
11
+ self.config = config
12
+
13
+ def generate_batch_size_chunks(self, list_of_elements, batch_size):
14
+ """
15
+ Split the dataset into smaller batches that we can process simultaneously
16
+ Yield successive batch-sized chunks from list_of_elements.
17
+ """
18
+ for i in range(0, len(list_of_elements), batch_size):
19
+ yield list_of_elements[i : i + batch_size]
20
+
21
+ def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,
22
+ device="cuda" if torch.cuda.is_available() else "cpu",
23
+ column_text="article",
24
+ column_summary="highlights"):
25
+ article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))
26
+ target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))
27
+
28
+ for article_batch, target_batch in tqdm(
29
+ zip(article_batches, target_batches), total=len(article_batches)):
30
+
31
+ inputs = tokenizer(article_batch, max_length=1024, truncation=True,
32
+ padding="max_length", return_tensors="pt")
33
+
34
+ summaries = model.generate(input_ids=inputs["input_ids"].to(device),
35
+ attention_mask=inputs["attention_mask"].to(device),
36
+ length_penalty=0.8, num_beams=8, max_length=128)
37
+
38
+ # The length_penalty parameter ensures that the model does not generate sequences that are too long.
39
+
40
+ # Finally, we decode the generated texts,
41
+ # replace the <n> token, and add the decoded texts with the references to the metric.
42
+ decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
43
+ clean_up_tokenization_spaces=True) for s in summaries]
44
+
45
+ decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
46
+
47
+
48
+ metric.add_batch(predictions=decoded_summaries, references=target_batch)
49
+
50
+ # Finally compute and return the ROUGE scores.
51
+ score = metric.compute()
52
+ return score
53
+
54
+ def evaluation(self):
55
+ device = "cuda" if torch.cuda.is_available() else "cpu"
56
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
57
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
58
+
59
+ # loading data
60
+ dataset_pt = load_from_disk(self.config.data_path)
61
+
62
+ rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
63
+
64
+ rouge_metric = load_metric('rouge')
65
+
66
+ score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')
67
+ rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
68
+ df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
69
+ df.to_csv(self.config.metric_file_name, index=False)
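Note: evaluation() writes a single-row CSV whose columns are the four ROUGE variants (the 'pegasus' row label is dropped because of index=False). A minimal sketch of inspecting it afterwards; the file path is an assumption standing in for config.metric_file_name:

import pandas as pd

scores = pd.read_csv("artifacts/model_evaluation/metrics.csv")  # assumed path
print(scores[["rouge1", "rouge2", "rougeL", "rougeLsum"]])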
src/summarylm/components/model_trainer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.entity import ModelTrainerConfig
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+ from transformers import TrainingArguments, Trainer
7
+ from transformers import DataCollatorForSeq2Seq
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
+ from datasets import load_dataset, load_from_disk
10
+ import torch
11
+
12
+
13
+ class ModelTrainer:
14
+ """
15
+ Class for training model
16
+
17
+ Args:
18
+ config (ModelTrainerConfig): Contains all configuration for model training
19
+ """
20
+ def __init__(self, config: ModelTrainerConfig):
21
+ self.config = config
22
+
23
+ def train(self):
24
+ """
25
+ Method to train the Pegasus model
26
+ """
27
+ logger.info("Entered train method of ModelTrainer class.")
28
+ try:
29
+ device = "cuda" if torch.cuda.is_available() else "cpu"
30
+ tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
31
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
32
+ seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
33
+
34
+ # loading the dataset
35
+ dataset_pt = load_from_disk(self.config.data_path)
36
+
37
+ trainer_args = TrainingArguments(
38
+ output_dir=self.config.root_dir,
39
+ num_train_epochs=self.config.num_train_epochs,
40
+ warmup_steps=self.config.warmup_steps,
41
+ per_device_train_batch_size=self.config.per_device_train_batch_size,
42
+ per_device_eval_batch_size=self.config.per_device_train_batch_size,
43
+ weight_decay=self.config.weight_decay,
44
+ logging_steps=self.config.logging_steps,
45
+ evaluation_strategy=self.config.evaluation_strategy,
46
+ eval_steps=self.config.eval_steps,
47
+ save_steps=1e6,
48
+ gradient_accumulation_steps=self.config.gradient_accumulation_steps,
49
+ )
50
+
51
+ trainer = Trainer(
52
+ model=model_pegasus,
53
+ args=trainer_args,
54
+ tokenizer=tokenizer,
55
+ data_collator=seq2seq_data_collator,
56
+ train_dataset=dataset_pt['train'],
57
+ eval_dataset=dataset_pt['validation']
58
+ )
59
+
60
+ trainer.train()
61
+
62
+ ## Save model
63
+ model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-summary-lm"))
64
+ ## Save tokenizer
65
+ tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))
66
+
67
+ logger.info("Completed trian method of ModelTrainer class.")
68
+ except Exception as e:
69
+ raise CustomException(e, sys) from e
src/summarylm/config/__init__.py ADDED
File without changes
src/summarylm/config/configuration.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.constants import *
2
+ from summarylm.utils.common import read_yaml, create_directories
3
+ from summarylm.entity import (DataIngestionConfig, DataValidationConfig, DataTransformationConfig, ModelTrainerConfig, ModelEvaluationConfig)
4
+
5
+ class ConfigurationManager:
6
+ """
7
+ Configuration Manager for Data Ingestion, Data Validation, Data Transformation, Model Training, and Model Evaluation
8
+
9
+ Args:
10
+ config_filepath (Path): Path to config yaml file
11
+ params_filepath (Path): Path to params yaml file
12
+
13
+ Returns:
14
+ None
15
+ """
16
+ def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:
17
+ self.config = read_yaml(config_filepath)
18
+ self.params = read_yaml(params_filepath)
19
+
20
+ create_directories([self.config.artifacts_root])
21
+
22
+ def get_data_ingestion_config(self) -> DataIngestionConfig:
23
+ config = self.config.data_ingestion
24
+
25
+ create_directories([config.root_dir])
26
+
27
+ data_ingestion_config = DataIngestionConfig(
28
+ root_dir=config.root_dir,
29
+ ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,
30
+ LOCAL_DATA_FILE = config.LOCAL_DATA_FILE,
31
+ )
32
+
33
+
34
+ return data_ingestion_config
35
+
36
+ def get_data_validation_config(self) -> DataValidationConfig:
37
+ config = self.config.data_validation
38
+
39
+ create_directories([config.root_dir])
40
+
41
+ data_validation_config = DataValidationConfig(
42
+ root_dir=config.root_dir,
43
+ STATUS_FILE=config.STATUS_FILE,
44
+ ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
45
+ ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
46
+ )
47
+
48
+ return data_validation_config
49
+
50
+ def get_data_transformation_config(self) -> DataTransformationConfig:
51
+ config = self.config.data_transformation
52
+
53
+ create_directories([config.root_dir])
54
+
55
+ data_transformation_config = DataTransformationConfig(
56
+ root_dir=config.root_dir,
57
+ data_path=config.data_path,
58
+ ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
59
+ tokenizer_name=config.tokenizer_name
60
+ )
61
+
62
+ return data_transformation_config
63
+
64
+ def get_model_trainer_config(self) -> ModelTrainerConfig:
65
+ config = self.config.model_trainer
66
+ params = self.params.TrainingArguments
67
+
68
+ create_directories([config.root_dir])
69
+
70
+ model_trainer_config = ModelTrainerConfig(
71
+ root_dir=config.root_dir,
72
+ data_path=config.data_path,
73
+ model_ckpt=config.model_ckpt,
74
+ num_train_epochs=params.num_train_epochs,
75
+ warmup_steps=params.warmup_steps,
76
+ per_device_train_batch_size=params.per_device_train_batch_size,
77
+ weight_decay=params.weight_decay,
78
+ logging_steps=params.logging_steps,
79
+ evaluation_strategy=params.evaluation_strategy,
80
+ eval_steps=params.eval_steps,
81
+ save_steps=params.save_steps,
82
+ gradient_accumulation_steps=params.gradient_accumulation_steps,
83
+ )
84
+
85
+ return model_trainer_config
86
+
87
+ def get_model_evaluation_config(self) -> ModelEvaluationConfig:
88
+ config = self.config.model_evaluation
89
+
90
+ create_directories([config.root_dir])
91
+
92
+ model_evaluation_config = ModelEvaluationConfig(
93
+ root_dir=config.root_dir,
94
+ data_path=config.data_path,
95
+ model_path=config.model_path,
96
+ tokenizer_path=config.tokenizer_path,
97
+ metric_file_name=config.metric_file_name,
98
+ )
99
+
100
+ return model_evaluation_config
src/summarylm/config/gcloud_syncer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ class GCloudSync:
4
+
5
+ def sync_folder_to_gcloud(self, gcp_bucket_url, filepath, filename):
6
+
7
+ """
8
+ Function to sync files from local machine to Google Cloud Storage
9
+
10
+ Args:
11
+ gcp_bucket_url (str): Google Cloud Storage bucket URL
12
+ filepath (str): Local file path
13
+ filename (str): Local file name
14
+
15
+ """
16
+
17
+ command = f"gsutil cp {filename}/{filepath} gs://{gcp_bucket_url}"
18
+
19
+ os.system(command)
20
+
21
+ def sync_folder_from_gcloud(self, gcp_bucket_url, filename, destination):
22
+
23
+ """
24
+ Function to sync a file from Google Cloud Storage to the local machine
25
+
26
+ Args:
27
+ gcp_bucket_url (str): Google Cloud Storage bucket URL
28
+ filename (str): Local file name
29
+ destination (str): Local file path
30
+ """
31
+
32
+ command = f"gsutil cp gs://{gcp_bucket_url}/{filename} {destination}/{filename}"
33
+
34
+ os.system(command)
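Note: GCloudSync only shells out to gsutil, so the Google Cloud SDK has to be installed and authenticated on the machine running the pipeline; nothing is validated in Python. A minimal usage sketch with hypothetical bucket, folder, and file names:

from summarylm.config.gcloud_syncer import GCloudSync

syncer = GCloudSync()
# Upload artifacts/model_trainer/model.zip to the bucket (all names are assumptions).
syncer.sync_folder_to_gcloud("my-summarylm-bucket", "artifacts/model_trainer", "model.zip")
# Download it back into a local folder.
syncer.sync_folder_from_gcloud("my-summarylm-bucket", "model.zip", "artifacts/model_trainer")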
src/summarylm/constants/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ CONFIG_FILE_PATH = Path("config/config.yaml")
4
+ PARAMS_FILE_PATH = Path("params.yaml")
src/summarylm/entity/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+ @dataclass(frozen=True)
5
+ class DataIngestionConfig:
6
+ root_dir: Path
7
+ ALL_HUGGINGFACE_DATA: list
8
+ LOCAL_DATA_FILE: list
9
+
10
+ @dataclass(frozen=True)
11
+ class DataValidationConfig:
12
+ root_dir: Path
13
+ STATUS_FILE: str
14
+ ALL_REQUIRED_DATA: list
15
+ ALL_REQUIRED_FILES: list
16
+
17
+ @dataclass(frozen=True)
18
+ class DataTransformationConfig:
19
+ root_dir: Path
20
+ data_path: Path
21
+ ALL_REQUIRED_DATA: list
22
+ tokenizer_name: Path
23
+
24
+ @dataclass(frozen=True)
25
+ class ModelTrainerConfig:
26
+ root_dir: Path
27
+ data_path: Path
28
+ model_ckpt: Path
29
+ num_train_epochs: int
30
+ warmup_steps: int
31
+ per_device_train_batch_size: int
32
+ weight_decay: float
33
+ logging_steps: int
34
+ evaluation_strategy: str
35
+ eval_steps: int
36
+ save_steps: float
37
+ gradient_accumulation_steps: int
38
+
39
+ @dataclass(frozen=True)
40
+ class ModelEvaluationConfig:
41
+ root_dir: Path
42
+ data_path: Path
43
+ model_path: Path
44
+ tokenizer_path: Path
45
+ metric_file_name: Path
src/summarylm/exception/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+
5
+ def error_message_detail(error, error_detail):
6
+ """
7
+ Returns the error message with error details and logs the error
8
+
9
+ Args:
10
+ error: error message
11
+ error_detail: error details
12
+
13
+ Returns:
14
+ error_message: error message
15
+ """
16
+ _, _, exe_tb = error_detail.exc_info()
17
+ file_name = exe_tb.tb_frame.f_code.co_filename
18
+ line_number = exe_tb.tb_lineno
19
+ error_message = "Error occured in file called [{0}] line number: [{1}] error message: [{2}]".format(
20
+ file_name, line_number, str(error)
21
+ )
22
+
23
+ logger.info(error_message)
24
+
25
+ return error_message
26
+
27
+ class CustomException(Exception):
28
+ def __init__(self, error_message, error_detail):
29
+ super().__init__(error_message)
30
+ self.error_message = error_message_detail(error_message, error_detail=error_detail)
31
+
32
+ def __str__(self):
33
+ return self.error_message
34
+
src/summarylm/logging/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+
5
+ """
6
+ Logs every event to a log file in the logs directory.
7
+ """
8
+
9
+ LOG_FILE = f"running_logs.log"
10
+ logs_path = os.path.join(os.getcwd(), "logs")
11
+ os.makedirs(logs_path, exist_ok=True)
12
+
13
+
14
+ LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)
15
+
16
+ logging.basicConfig(
17
+ filename=LOG_FILE_PATH,
18
+ format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
19
+ level=logging.INFO,
20
+ )
21
+
22
+ logger = logging.getLogger("textSummarizerLogger")
src/summarylm/pipeline/__init__.py ADDED
File without changes
src/summarylm/pipeline/data_ingestion.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_ingestion import DataIngestion
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataIngestionPipeline:
9
+ """
10
+ Pipeline for data ingestion
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_ingestion_config = config.get_data_ingestion_config()
19
+ data_ingestion = DataIngestion(config=data_ingestion_config)
20
+ data_ingestion.download_data()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/data_transformation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_transformation import DataTransformation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataTransformationPipeline:
9
+ """
10
+ Pipeline for data transformation to convert data into the right format
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_transformation_config = config.get_data_transformation_config()
19
+ data_transformation = DataTransformation(config=data_transformation_config)
20
+ data_transformation.convert()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/data_validation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_validation import DataValidation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataValidationPipeline:
9
+ """
10
+ Pipeline for validating if data exists
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_validation_config = config.get_data_validation_config()
19
+ data_validation = DataValidation(config=data_validation_config)
20
+ data_validation.validate_all_files_exist()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/model_evaluation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.model_evaluation import ModelEvaluation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class ModelEvaluationPipeline:
9
+ """
10
+ Pipeline for pegasus model evaluation
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ model_evaluation_config = config.get_model_evaluation_config()
19
+ model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
20
+ model_evaluation_config.evaluation()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/model_trainer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.model_trainer import ModelTrainer
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class ModelTrainerPipeline:
9
+ """
10
+ Pipeline for training pegasus model
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ model_trainer_config = config.get_model_trainer_config()
19
+ model_trainer_config = ModelTrainer(config=model_trainer_config)
20
+ model_trainer_config.train()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/prediction.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.config.configuration import ConfigurationManager
2
+ from transformers import AutoTokenizer
3
+ from transformers import pipeline
4
+
5
+ class PredictionPipeline:
6
+ def __init__(self):
7
+ self.config = ConfigurationManager().get_model_evaluation_config()
8
+
9
+ def predict(self, text, max_length):
10
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
11
+ gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": max_length}
12
+
13
+ pipe = pipeline("summarization", model=self.config.model_path, tokenizer=tokenizer)
14
+
15
+ output = pipe(text, **gen_kwargs)[0]["summary_text"]
16
+
17
+ return output
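Note: PredictionPipeline wraps the fine-tuned checkpoint and tokenizer in a transformers summarization pipeline, so inference is a single call once the model artifacts exist. A minimal usage sketch (the input text is a placeholder and max_length is a free choice):

from summarylm.pipeline.prediction import PredictionPipeline

predictor = PredictionPipeline()
article = "Paste the long article or dialogue to be summarized here ..."
summary = predictor.predict(article, max_length=128)
print(summary)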
src/summarylm/utils/__init__.py ADDED
File without changes
src/summarylm/utils/common.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from box.exceptions import BoxValueError
4
+ from summarylm.exception import CustomException
5
+ import yaml
6
+ from summarylm.logging import logger
7
+ from ensure import ensure_annotations
8
+ from box import ConfigBox
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
+ @ensure_annotations
14
+ def read_yaml(path_to_yaml: Path) -> ConfigBox:
15
+ """
16
+ Reads a yaml file and returns its contents
17
+
18
+ Args:
19
+ path_to_yaml (Path): path to the yaml file
20
+
21
+ Raises:
22
+ ValueError: if yaml file is empty
23
+
24
+ Returns:
25
+ ConfigBox: ConfigBox type
26
+ """
27
+
28
+ try:
29
+ with open(path_to_yaml) as yaml_file:
30
+ content = yaml.safe_load(yaml_file)
31
+ logger.info(f"Yaml file: {path_to_yaml} loaded successfully")
32
+ return ConfigBox(content)
33
+ except BoxValueError:
34
+ raise ValueError("yaml file is empty")
35
+ except Exception as e:
36
+ raise CustomException(e, sys) from e
37
+
38
+ @ensure_annotations
39
+ def create_directories(path_to_directories: list, verbose=True):
40
+ """
41
+ Create list of directories
42
+
43
+ Args:
44
+ path_to_directories (list): list of path of directories
45
+ verbose (bool, optional): log each created directory. Defaults to True
46
+ """
47
+
48
+ for path in path_to_directories:
49
+ os.makedirs(path, exist_ok=True)
50
+ if verbose:
51
+ logger.info(f"Directory created successfully at: {path}")
52
+
53
+ @ensure_annotations
54
+ def get_size(path: Path) -> str:
55
+ """
56
+ Get size in KB
57
+
58
+ Args:
59
+ path (Path): path of the file
60
+
61
+ Returns:
62
+ str: size in KB
63
+ """
64
+ size_in_kb = round(os.path.getsize(path)/1024)
65
+ return f"~{size_in_kb} KB"
template.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
6
+
7
+ project_name = "summarylm"
8
+
9
+ list_of_file = [
10
+ ".github/workflows/.gitkeep",
11
+ f"src/{project_name}/__init__.py",
12
+ f"src/{project_name}/components/__init__.py",
13
+ f"src/{project_name}/components/data_ingestion.py",
14
+ f"src/{project_name}/components/data_transformation.py",
15
+ f"src/{project_name}/components/data_validation.py",
16
+ f"src/{project_name}/components/model_evaluation.py",
17
+ f"src/{project_name}/components/model_trainer.py",
18
+ f"src/{project_name}/utils/__init__.py",
19
+ f"src/{project_name}/utils/common.py",
20
+ f"src/{project_name}/logging/__init__.py",
21
+ f"src/{project_name}/exception/__init__.py",
22
+ f"src/{project_name}/config/__init__.py",
23
+ f"src/{project_name}/config/configuration.py",
24
+ f"src/{project_name}/config/gcloud_syncer.py",
25
+ f"src/{project_name}/pipeline/__init__.py",
26
+ f"src/{project_name}/pipeline/data_ingestion.py",
27
+ f"src/{project_name}/entity/__init__.py",
28
+ f"src/{project_name}/constants/__init__.py",
29
+ "config/config.yaml",
30
+ "params.yaml",
31
+ "app.py",
32
+ "main.py",
33
+ "Dockerfile",
34
+ "requirements.txt",
35
+ "setup.py",
36
+ "research/experiment.ipynb",
37
+ ]
38
+
39
+ for filepath in list_of_file:
40
+ filepath = Path(filepath)
41
+ filedir, filename = os.path.split(filepath)
42
+
43
+ if filedir != "":
44
+ os.makedirs(filedir, exist_ok=True)
45
+ logging.info(f"Creating directory: {filedir} for the file {filename}")
46
+
47
+
48
+ if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
49
+ with open(filepath, 'w') as f:
50
+ pass
51
+ logging.info(f"Creating empty file: {filepath}")
52
+ else:
53
+ logging.info(f"{filename} is already exists")