Commit 95576a3 · Initial commit

Files changed:
- .github/workflows/main.yml +34 -0
- .gitignore +162 -0
- Dockerfile +17 -0
- LICENSE +201 -0
- README.md +11 -0
- app.py +36 -0
- config/config.yaml +34 -0
- main.py +66 -0
- params.yaml +10 -0
- requirements.txt +23 -0
- research/data_ingestion.ipynb +183 -0
- research/data_transformation.ipynb +234 -0
- research/data_validation.ipynb +197 -0
- research/experiment.ipynb +74 -0
- research/model_evaluation.ipynb +265 -0
- research/model_trainer.ipynb +239 -0
- setup.py +22 -0
- src/summarylm/__init__.py +0 -0
- src/summarylm/components/__init__.py +0 -0
- src/summarylm/components/data_ingestion.py +39 -0
- src/summarylm/components/data_transformation.py +107 -0
- src/summarylm/components/data_validation.py +45 -0
- src/summarylm/components/model_evaluation.py +69 -0
- src/summarylm/components/model_trainer.py +69 -0
- src/summarylm/config/__init__.py +0 -0
- src/summarylm/config/configuration.py +100 -0
- src/summarylm/config/gcloud_syncer.py +34 -0
- src/summarylm/constants/__init__.py +4 -0
- src/summarylm/entity/__init__.py +45 -0
- src/summarylm/exception/__init__.py +34 -0
- src/summarylm/logging/__init__.py +22 -0
- src/summarylm/pipeline/__init__.py +0 -0
- src/summarylm/pipeline/data_ingestion.py +22 -0
- src/summarylm/pipeline/data_transformation.py +22 -0
- src/summarylm/pipeline/data_validation.py +22 -0
- src/summarylm/pipeline/model_evaluation.py +22 -0
- src/summarylm/pipeline/model_trainer.py +22 -0
- src/summarylm/pipeline/prediction.py +17 -0
- src/summarylm/utils/__init__.py +0 -0
- src/summarylm/utils/common.py +65 -0
- template.py +53 -0
.github/workflows/main.yml
ADDED
@@ -0,0 +1,34 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # To run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true

      - name: Set up Git user
        run: |
          git config --global user.email "[email protected]"
          git config --global user.name "satyam998"

      - name: Create a new branch
        run: |
          git checkout --orphan temp
          git add -A
          git commit -m "Initial commit"
          git branch -D main
          git branch -m main

      - name: Force push to hub
        env:
          HF: ${{ secrets.HG }}
        run: git push --force https://satyam998:[email protected]/spaces/satyam998/pegasus-summary-lm main
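The force-push step authenticates by embedding the token stored in the repository secret HG (exposed to the step as the HF environment variable) directly in the remote URL, so every push to main on GitHub overwrites the main branch of the Space at huggingface.co/spaces/satyam998/pegasus-summary-lm.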
.gitignore
ADDED
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
artifacts/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
"artifacts/"
Dockerfile
ADDED
@@ -0,0 +1,17 @@
FROM python:3.9

RUN useradd -m -u 1000 user

WORKDIR /app

COPY --chown=user . /app

RUN pip install -r requirements.txt
RUN pip install --upgrade accelerate
RUN pip uninstall -y transformers accelerate
RUN pip install transformers accelerate

RUN mkdir -p /app/logs
RUN chmod 777 /app/logs

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
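For local testing, roughly the same container the Space builds can be run by hand, e.g. `docker build -t pegasus-summary-lm .` followed by `docker run -p 7860:7860 pegasus-summary-lm` (a hedged sketch; on the Space itself the image is built and started automatically, and 7860 is the port uvicorn listens on in the CMD above).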
LICENSE
ADDED
@@ -0,0 +1,201 @@
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
README.md
ADDED
@@ -0,0 +1,11 @@
---
title: Pegasus Summary Lm
emoji: 🔥
colorFrom: green
colorTo: pink
sdk: docker
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,36 @@
from fastapi import FastAPI
import uvicorn
import sys
import os
from fastapi.templating import Jinja2Templates
from starlette.responses import RedirectResponse
from fastapi.responses import Response
from summarylm.pipeline.prediction import PredictionPipeline
from summarylm.exception import CustomException

text: str = "What is Text Summarization?"

app = FastAPI()

@app.get("/", tags=["authentication"])
async def index():
    return RedirectResponse(url='/docs')

@app.get("/train")
async def training():
    try:
        os.system("python main.py")
        return Response("Training Successful!!")
    except Exception as e:
        return Response(f"Error Occurred! {e}")

@app.post("/predict")
async def predict_route(text, max_length: int = 128):
    try:
        print(type(max_length))
        obj = PredictionPipeline()
        text = obj.predict(text, max_length)
        return text
    except Exception as e:
        raise CustomException(e, sys) from e

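Once the app is running, /train and /predict can be exercised from any HTTP client; the root route simply redirects to the interactive docs at /docs. A minimal client sketch using the requests library (an assumption; note that text and max_length travel as query parameters, since the route declares them as plain function arguments):

import requests

# Hypothetical call against a locally running container (port 7860 from the Dockerfile CMD).
response = requests.post(
    "http://localhost:7860/predict",
    params={
        "text": "Long article text to summarize ...",
        "max_length": 128,  # cap on the generated summary length
    },
)
print(response.json())  # summary string returned by PredictionPipeline.predict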
config/config.yaml
ADDED
@@ -0,0 +1,34 @@
artifacts_root: artifacts

data_ingestion:
  root_dir: artifacts/data_ingestion
  ALL_HUGGINGFACE_DATA: ["d0rj/wikisum", "multi_news"]
  LOCAL_DATA_FILE: ["artifacts/data_ingestion/wikisum", "artifacts/data_ingestion/multi_news"]


data_validation:
  root_dir: artifacts/data_validation
  STATUS_FILE: artifacts/data_validation/status.txt
  ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
  ALL_REQUIRED_FILES: ["train", "test", "validation"]


data_transformation:
  root_dir: artifacts/data_transformation
  data_path: artifacts/data_ingestion/
  ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
  tokenizer_name: google/pegasus-cnn_dailymail


model_trainer:
  root_dir: artifacts/model_trainer
  data_path: artifacts/data_transformation/dataset
  model_ckpt: google/pegasus-cnn_dailymail


model_evaluation:
  root_dir: artifacts/model_evaluation
  data_path: artifacts/data_transformation/dataset
  model_path: artifacts/model_trainer/pegasus-summary-lm
  tokenizer_path: artifacts/model_trainer/tokenizer
  metric_file_name: artifacts/model_evaluation/metrics.csv
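The components read this file through read_yaml from src/summarylm/utils/common.py, whose body is not shown in this view; the attribute-style access used in the notebooks (config.data_ingestion.root_dir) suggests it returns a python-box ConfigBox (python-box is pinned in requirements.txt). A minimal sketch under that assumption:

import yaml
from pathlib import Path
from box import ConfigBox  # python-box, listed in requirements.txt

def read_yaml_sketch(path: Path) -> ConfigBox:
    # Hypothetical stand-in for summarylm.utils.common.read_yaml
    with open(path) as f:
        return ConfigBox(yaml.safe_load(f))

config = read_yaml_sketch(Path("config/config.yaml"))
print(config.data_ingestion.ALL_HUGGINGFACE_DATA)  # ['d0rj/wikisum', 'multi_news']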
main.py
ADDED
@@ -0,0 +1,66 @@
import sys
from summarylm.pipeline.data_ingestion import DataIngestionPipeline
from summarylm.pipeline.data_validation import DataValidationPipeline
from summarylm.pipeline.data_transformation import DataTransformationPipeline
from summarylm.pipeline.model_trainer import ModelTrainerPipeline
from summarylm.pipeline.model_evaluation import ModelEvaluationPipeline
from summarylm.logging import logger
from summarylm.exception import CustomException

# data ingestion
STAGE_NAME = "Data Ingestion"

try:
    logger.info(f"Starting {STAGE_NAME} stage...")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    logger.info(f"Completed {STAGE_NAME} stage...")
except Exception as e:
    raise CustomException(e, sys) from e


# data validation
STAGE_NAME = "Data Validation"

try:
    logger.info(f"Starting {STAGE_NAME} stage...")
    data_validation = DataValidationPipeline()
    data_validation.main()
    logger.info(f"Completed {STAGE_NAME} stage...")
except Exception as e:
    raise CustomException(e, sys) from e


# data transformation
STAGE_NAME = "Data Transformation"

try:
    logger.info(f"Starting {STAGE_NAME} stage...")
    data_transformation = DataTransformationPipeline()
    data_transformation.main()
    logger.info(f"Completed {STAGE_NAME} stage...")
except Exception as e:
    raise CustomException(e, sys) from e


# model trainer
STAGE_NAME = "Model Trainer"

try:
    logger.info(f"Starting {STAGE_NAME} stage...")
    model_trainer = ModelTrainerPipeline()
    model_trainer.main()
    logger.info(f"Completed {STAGE_NAME} stage...")
except Exception as e:
    raise CustomException(e, sys) from e

# model evaluation
STAGE_NAME = "Model Evaluation"

try:
    logger.info(f"Starting {STAGE_NAME} stage...")
    model_evaluation = ModelEvaluationPipeline()
    model_evaluation.main()
    logger.info(f"Completed {STAGE_NAME} stage...")
except Exception as e:
    raise CustomException(e, sys) from e
params.yaml
ADDED
@@ -0,0 +1,10 @@
TrainingArguments:
  num_train_epochs: 1
  warmup_steps: 500
  per_device_train_batch_size: 1
  weight_decay: 0.01
  logging_steps: 10
  evaluation_strategy: steps
  eval_steps: 500
  save_steps: 1e6
  gradient_accumulation_steps: 16
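These values are consumed by the model trainer stage (src/summarylm/components/model_trainer.py, added in this commit but not shown in this view). A minimal sketch of how such a block is typically unpacked into transformers.TrainingArguments; the exact wiring in the repo is an assumption here:

import yaml
from transformers import TrainingArguments

with open("params.yaml") as f:
    params = yaml.safe_load(f)["TrainingArguments"]

# Note: PyYAML parses 1e6 as the string "1e6", so save_steps is coerced explicitly.
args = TrainingArguments(
    output_dir="artifacts/model_trainer",  # root_dir from config.yaml
    num_train_epochs=params["num_train_epochs"],
    warmup_steps=params["warmup_steps"],
    per_device_train_batch_size=params["per_device_train_batch_size"],
    weight_decay=params["weight_decay"],
    logging_steps=params["logging_steps"],
    evaluation_strategy=params["evaluation_strategy"],
    eval_steps=params["eval_steps"],
    save_steps=int(float(params["save_steps"])),
    gradient_accumulation_steps=params["gradient_accumulation_steps"],
)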
requirements.txt
ADDED
@@ -0,0 +1,23 @@
transformers
transformers[sentencepiece]
transformers[torch]
datasets
sacrebleu
rouge_score
py7zr
pandas
nltk
tqdm
PyYAML
matplotlib
torch
notebook
boto3
mypy-boto3-s3
python-box==7.1.1
ensure==1.0.4
fastapi==0.78.0
uvicorn==0.29.0
Jinja2==3.1.4
google-cloud-storage
-e .
research/data_ingestion.ipynb
ADDED
@@ -0,0 +1,183 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import zipfile\n",
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "from summarylm.logging import logger\n",
    "from summarylm.constants import *\n",
    "from summarylm.utils.common import read_yaml, create_directories, get_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass(frozen=True)\n",
    "class DataIngestionConfig:\n",
    "    root_dir: Path\n",
    "    ALL_HUGGINGFACE_DATA: list\n",
    "    LOCAL_DATA_FILE: list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ConfigurationManager:\n",
    "    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
    "        config = self.config.data_ingestion\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        data_ingestion_config = DataIngestionConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,\n",
    "            LOCAL_DATA_FILE = config.LOCAL_DATA_FILE,\n",
    "        )\n",
    "\n",
    "        return data_ingestion_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "class DataIngestion:\n",
    "    def __init__(self, config: DataIngestionConfig):\n",
    "        self.config = config\n",
    "\n",
    "    def download_data(self):\n",
    "        for i in range(len(self.config.LOCAL_DATA_FILE)):\n",
    "            if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):\n",
    "                dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])\n",
    "                dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])\n",
    "                logger.info(f\"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!\")\n",
    "            else:\n",
    "                logger.info(f\"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-23 07:53:04,706: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-05-23 07:53:04,709: INFO: common: Yaml file: params.yaml loaded successfully]\n",
      "[2024-05-23 07:53:04,710: INFO: common: Directory created successfully at: artifacts]\n",
      "[2024-05-23 07:53:04,711: INFO: common: Directory created successfully at: artifacts/data_ingestion]\n",
      "[2024-05-23 07:53:04,711: INFO: 368978256: File already exists of size: ~0 KB]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading data: 100%|██████████| 295M/295M [00:34<00:00, 8.46MB/s] \n",
      "Downloading data: 100%|██████████| 28.3M/28.3M [00:05<00:00, 5.38MB/s]\n",
      "Downloading data: 100%|██████████| 39.5M/39.5M [00:06<00:00, 5.72MB/s]\n",
      "Downloading data: 100%|██████████| 40.1M/40.1M [00:06<00:00, 5.83MB/s]\n",
      "Generating train split: 100%|██████████| 44972/44972 [00:03<00:00, 13618.69 examples/s]\n",
      "Generating validation split: 100%|██████████| 5622/5622 [00:00<00:00, 25120.36 examples/s]\n",
      "Generating test split: 100%|██████████| 5622/5622 [00:00<00:00, 22323.24 examples/s]\n",
      "Saving the dataset (2/2 shards): 100%|██████████| 44972/44972 [00:07<00:00, 5653.51 examples/s] \n",
      "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15343.69 examples/s]\n",
      "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15216.24 examples/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-23 07:54:25,968: INFO: 368978256: multi_news downloaded!]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    data_ingestion_config = config.get_data_ingestion_config()\n",
    "    data_ingestion = DataIngestion(config=data_ingestion_config)\n",
    "    data_ingestion.download_data()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
research/data_transformation.ipynb
ADDED
@@ -0,0 +1,234 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.chdir(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataTransformationConfig:\n",
    "    root_dir: Path\n",
    "    data_path: Path\n",
    "    ALL_REQUIRED_DATA: Path\n",
    "    tokenizer_name: Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from summarylm.constants import *\n",
    "from summarylm.utils.common import read_yaml, create_directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ConfigurationManager:\n",
    "    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_data_transformation_config(self) -> DataTransformationConfig:\n",
    "        config = self.config.data_transformation\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        data_transformation_config = DataTransformationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            data_path=config.data_path,\n",
    "            ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
    "            tokenizer_name=config.tokenizer_name\n",
    "        )\n",
    "\n",
    "        return data_transformation_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from summarylm.logging import logger\n",
    "from summarylm.exception import CustomException\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset, load_from_disk\n",
    "from datasets import concatenate_datasets, DatasetDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DataTransformation:\n",
    "    def __init__(self, config: DataTransformationConfig):\n",
    "        self.config = config\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
    "\n",
    "    def convert_data_into_right_format(self, datasets: list) -> DatasetDict:\n",
    "        # loading all datasets\n",
    "        loaded_datasets = {}\n",
    "        print(\"Loading the dataset\")\n",
    "        for data in datasets:\n",
    "            loaded_datasets[data] = load_from_disk(data)\n",
    "\n",
    "        dataset1 = loaded_datasets[datasets[0]]\n",
    "        dataset2 = loaded_datasets[datasets[1]]\n",
    "        print(\"Dataset loaded\")\n",
    "\n",
    "        # removing unwanted columns from dataset1\n",
    "        dataset1_train = dataset1['train'].select_columns(['article', 'summary'])\n",
    "        dataset1_test = dataset1['test'].select_columns(['article', 'summary'])\n",
    "        dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])\n",
    "\n",
    "        # renaming data column name of dataset1\n",
    "        dataset1_train = dataset1_train.rename_column('article', 'text')\n",
    "        dataset1_test = dataset1_test.rename_column('article', 'text')\n",
    "        dataset1_validation = dataset1_validation.rename_column('article', 'text')\n",
    "\n",
    "        # renaming data column name of dataset2\n",
    "        dataset2_train = dataset2['train'].rename_column('document', 'text')\n",
    "        dataset2_test = dataset2['test'].rename_column('document', 'text')\n",
    "        dataset2_validation = dataset2['validation'].rename_column('document', 'text')\n",
    "\n",
    "        # concatenate_datasets\n",
    "        dataset_train = concatenate_datasets([dataset1_train, dataset2_train])\n",
    "        dataset_test = concatenate_datasets([dataset1_test, dataset2_test])\n",
    "        dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])\n",
    "\n",
    "        # loading the dataset into DatasetDict\n",
    "        dataset = DatasetDict({\n",
    "            \"train\": dataset_train,\n",
    "            \"validation\": dataset_validation,\n",
    "            \"test\": dataset_test,\n",
    "        })\n",
    "\n",
    "        return dataset\n",
    "\n",
    "    def convert_examples_to_features(self, example_batch):\n",
    "        input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)\n",
    "\n",
    "        with self.tokenizer.as_target_tokenizer():\n",
    "            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)\n",
    "\n",
    "        return {\n",
    "            'input_ids': input_encodings['input_ids'],\n",
    "            'attention_mask': input_encodings['attention_mask'],\n",
    "            'labels': target_encodings['input_ids']\n",
    "        }\n",
    "\n",
    "    def convert(self):\n",
    "        data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])\n",
    "        data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])\n",
    "\n",
    "        dataset = self.convert_data_into_right_format([data1, data2])\n",
    "        dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)\n",
    "        dataset_pt.save_to_disk(os.path.join(self.config.root_dir, \"dataset\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-23 09:04:24,048: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-05-23 09:04:24,051: INFO: common: Yaml file: params.yaml loaded successfully]\n",
      "[2024-05-23 09:04:24,052: INFO: common: Directory created successfully at: artifacts]\n",
      "[2024-05-23 09:04:24,053: INFO: common: Directory created successfully at: artifacts/data_transformation]\n",
      "Loading the dataset\n",
      "Dataset loaded\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 0%| | 0/80747 [00:00<?, ? examples/s]d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:3921: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
      " warnings.warn(\n",
      "Map: 100%|██████████| 80747/80747 [11:43<00:00, 114.72 examples/s]\n",
      "Map: 100%|██████████| 7622/7622 [01:20<00:00, 94.22 examples/s] \n",
      "Map: 100%|██████████| 7622/7622 [01:59<00:00, 63.80 examples/s] \n",
      "Saving the dataset (3/3 shards): 100%|██████████| 80747/80747 [00:13<00:00, 5803.62 examples/s] \n",
      "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 4202.00 examples/s]\n",
      "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 6924.25 examples/s]\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    data_transformation_config = config.get_data_transformation_config()\n",
    "    data_transformation = DataTransformation(config=data_transformation_config)\n",
    "    data_transformation.convert()\n",
    "except Exception as e:\n",
    "    raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
research/data_validation.ipynb
ADDED
@@ -0,0 +1,197 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class DataValidationConfig:\n",
    "    root_dir: Path\n",
    "    STATUS_FILE: str\n",
    "    ALL_REQUIRED_DATA: list\n",
    "    ALL_REQUIRED_FILES: list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from summarylm.constants import *\n",
    "from summarylm.utils.common import read_yaml, create_directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ConfigurationManager:\n",
    "    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
    "        self.config = read_yaml(config_filepath)\n",
    "        self.params = read_yaml(params_filepath)\n",
    "\n",
    "        create_directories([self.config.artifacts_root])\n",
    "\n",
    "    def get_data_validation_config(self) -> DataValidationConfig:\n",
    "        config = self.config.data_validation\n",
    "\n",
    "        create_directories([config.root_dir])\n",
    "\n",
    "        data_validation_config = DataValidationConfig(\n",
    "            root_dir=config.root_dir,\n",
    "            STATUS_FILE=config.STATUS_FILE,\n",
    "            ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
    "            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
    "        )\n",
    "\n",
    "        return data_validation_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from summarylm.logging import logger\n",
    "from summarylm.exception import CustomException"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DataValidation:\n",
    "    def __init__(self, config: DataValidationConfig):\n",
    "        self.config = config\n",
    "\n",
    "    def validate_all_files_exist(self) -> bool:\n",
    "        try:\n",
    "            validation_status = None\n",
    "\n",
    "            for data in self.config.ALL_REQUIRED_DATA:\n",
    "                all_files = os.listdir(os.path.join(\"artifacts\", \"data_ingestion\", data))\n",
    "\n",
    "                for file in all_files:\n",
    "                    if file not in self.config.ALL_REQUIRED_FILES:\n",
    "                        validation_status = False\n",
    "\n",
    "                        with open(self.config.STATUS_FILE, 'w') as f:\n",
    "                            f.write(f\"Validation status: {validation_status}\")\n",
    "                    else:\n",
    "                        validation_status = True\n",
    "\n",
    "                        with open(self.config.STATUS_FILE, 'w') as f:\n",
    "                            f.write(f\"Validation status: {validation_status}\")\n",
    "\n",
    "            return validation_status\n",
    "        except Exception as e:\n",
    "            raise CustomException(e, sys) from e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-05-23 08:11:43,852: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
      "[2024-05-23 08:11:43,856: INFO: common: Yaml file: params.yaml loaded successfully]\n",
      "[2024-05-23 08:11:43,857: INFO: common: Directory created successfully at: artifacts]\n",
      "[2024-05-23 08:11:43,858: INFO: common: Directory created successfully at: artifacts/data_validation]\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    config = ConfigurationManager()\n",
    "    data_validation_config = config.get_data_validation_config()\n",
    "    data_validation = DataValidation(config=data_validation_config)\n",
    "    data_validation.validate_all_files_exist()\n",
    "except Exception as e:\n",
    "    raise CustomException(e, sys) from e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
research/experiment.ipynb
ADDED
|
@@ -0,0 +1,74 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"from ensure import ensure_annotations"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 5,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"@ensure_annotations\n",
|
| 19 |
+
"def multiply(a: int, b: int) -> int:\n",
|
| 20 |
+
" return a * b"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": 6,
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"ename": "EnsureError",
|
| 30 |
+
"evalue": "Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>",
|
| 31 |
+
"output_type": "error",
|
| 32 |
+
"traceback": [
|
| 33 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 34 |
+
"\u001b[1;31mEnsureError\u001b[0m Traceback (most recent call last)",
|
| 35 |
+
"Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mmultiply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
| 36 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\ensure\\main.py:870\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value, templ):\n\u001b[0;32m 869\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mArgument \u001b[39m\u001b[38;5;132;01m{arg}\u001b[39;00m\u001b[38;5;124m of type \u001b[39m\u001b[38;5;132;01m{valt}\u001b[39;00m\u001b[38;5;124m to \u001b[39m\u001b[38;5;132;01m{f}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match annotation type \u001b[39m\u001b[38;5;132;01m{t}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 870\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EnsureError(msg\u001b[38;5;241m.\u001b[39mformat(arg\u001b[38;5;241m=\u001b[39marg, f\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf, t\u001b[38;5;241m=\u001b[39mtempl, valt\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mtype\u001b[39m(value)))\n\u001b[0;32m 872\u001b[0m return_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 873\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_val, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_templ):\n",
|
| 37 |
+
"\u001b[1;31mEnsureError\u001b[0m: Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>"
|
| 38 |
+
]
|
| 39 |
+
}
|
| 40 |
+
],
|
| 41 |
+
"source": [
|
| 42 |
+
"multiply(2, \"3\")"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": null,
|
| 48 |
+
"metadata": {},
|
| 49 |
+
"outputs": [],
|
| 50 |
+
"source": []
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"metadata": {
|
| 54 |
+
"kernelspec": {
|
| 55 |
+
"display_name": "env",
|
| 56 |
+
"language": "python",
|
| 57 |
+
"name": "python3"
|
| 58 |
+
},
|
| 59 |
+
"language_info": {
|
| 60 |
+
"codemirror_mode": {
|
| 61 |
+
"name": "ipython",
|
| 62 |
+
"version": 3
|
| 63 |
+
},
|
| 64 |
+
"file_extension": ".py",
|
| 65 |
+
"mimetype": "text/x-python",
|
| 66 |
+
"name": "python",
|
| 67 |
+
"nbconvert_exporter": "python",
|
| 68 |
+
"pygments_lexer": "ipython3",
|
| 69 |
+
"version": "3.12.2"
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
"nbformat": 4,
|
| 73 |
+
"nbformat_minor": 2
|
| 74 |
+
}
|
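The experiment notebook above only exercises the ensure package: @ensure_annotations enforces the type hints at call time, which is why multiply(2, "3") raises EnsureError in the last executed cell. A minimal self-contained sketch of the same behaviour with the error handled rather than propagated (EnsureError is imported from ensure.main, as the traceback shows):

from ensure import ensure_annotations
from ensure.main import EnsureError

@ensure_annotations
def multiply(a: int, b: int) -> int:
    # The decorator checks the annotations at call time instead of only documenting them.
    return a * b

print(multiply(2, 3))   # 6

try:
    multiply(2, "3")    # wrong type for b
except EnsureError as err:
    print(f"Call rejected: {err}")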
research/model_evaluation.ipynb
ADDED
|
@@ -0,0 +1,265 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import os"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 2,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"os.chdir(\"../\")"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 3,
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [
|
| 26 |
+
{
|
| 27 |
+
"data": {
|
| 28 |
+
"text/plain": [
|
| 29 |
+
"'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
"execution_count": 3,
|
| 33 |
+
"metadata": {},
|
| 34 |
+
"output_type": "execute_result"
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"source": [
|
| 38 |
+
"%pwd"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": 4,
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": [
|
| 47 |
+
"from dataclasses import dataclass\n",
|
| 48 |
+
"from pathlib import Path\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"@dataclass(frozen=True)\n",
|
| 51 |
+
"class ModelEvaluationConfig:\n",
|
| 52 |
+
" root_dir: Path\n",
|
| 53 |
+
" data_path: Path\n",
|
| 54 |
+
" model_path: Path\n",
|
| 55 |
+
" tokenizer_path: Path\n",
|
| 56 |
+
" metric_file_name: Path"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 5,
|
| 62 |
+
"metadata": {},
|
| 63 |
+
"outputs": [],
|
| 64 |
+
"source": [
|
| 65 |
+
"from summarylm.constants import *\n",
|
| 66 |
+
"from summarylm.utils.common import read_yaml, create_directories"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": 7,
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [],
|
| 74 |
+
"source": [
|
| 75 |
+
"class ConfigurationManager:\n",
|
| 76 |
+
" def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
|
| 77 |
+
" self.config = read_yaml(config_filepath)\n",
|
| 78 |
+
" self.params = read_yaml(params_filepath)\n",
|
| 79 |
+
"\n",
|
| 80 |
+
" create_directories([self.config.artifacts_root])\n",
|
| 81 |
+
"\n",
|
| 82 |
+
" def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
|
| 83 |
+
" config = self.config.model_evaluation\n",
|
| 84 |
+
"\n",
|
| 85 |
+
" create_directories([config.root_dir])\n",
|
| 86 |
+
"\n",
|
| 87 |
+
" model_evaluation_config = ModelEvaluationConfig(\n",
|
| 88 |
+
" root_dir=config.root_dir,\n",
|
| 89 |
+
" data_path=config.data_path,\n",
|
| 90 |
+
" model_path=config.model_path,\n",
|
| 91 |
+
" tokenizer_path=config.tokenizer_path,\n",
|
| 92 |
+
" metric_file_name=config.metric_file_name,\n",
|
| 93 |
+
" )\n",
|
| 94 |
+
"\n",
|
| 95 |
+
" return model_evaluation_config"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": 9,
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [
|
| 103 |
+
{
|
| 104 |
+
"name": "stdout",
|
| 105 |
+
"output_type": "stream",
|
| 106 |
+
"text": [
|
| 107 |
+
"[2024-05-21 08:29:30,191: INFO: config: PyTorch version 2.3.0 available.]\n"
|
| 108 |
+
]
|
| 109 |
+
}
|
| 110 |
+
],
|
| 111 |
+
"source": [
|
| 112 |
+
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
|
| 113 |
+
"from datasets import load_dataset, load_from_disk, load_metric\n",
|
| 114 |
+
"import torch\n",
|
| 115 |
+
"import pandas as pd\n",
|
| 116 |
+
"from tqdm import tqdm"
|
| 117 |
+
]
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"cell_type": "code",
|
| 121 |
+
"execution_count": 10,
|
| 122 |
+
"metadata": {},
|
| 123 |
+
"outputs": [],
|
| 124 |
+
"source": [
|
| 125 |
+
"class ModelEvaluation:\n",
|
| 126 |
+
" def __init__(self, config: ModelEvaluationConfig):\n",
|
| 127 |
+
" self.config = config\n",
|
| 128 |
+
" \n",
|
| 129 |
+
" def generate_batch_size_chunks(self, list_of_elements, batch_size):\n",
|
| 130 |
+
" \"\"\"\n",
|
| 131 |
+
" Split the dataset into smaller batches that we can process simultaneously\n",
|
| 132 |
+
" Yield successive batch-sized chunks from list_of_elements.\n",
|
| 133 |
+
" \"\"\"\n",
|
| 134 |
+
" for i in range(0, len(list_of_elements), batch_size):\n",
|
| 135 |
+
" yield list_of_elements[i : i + batch_size]\n",
|
| 136 |
+
" \n",
|
| 137 |
+
" def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,\n",
|
| 138 |
+
" device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
|
| 139 |
+
" column_text=\"article\",\n",
|
| 140 |
+
" column_summary=\"highlights\"):\n",
|
| 141 |
+
" article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))\n",
|
| 142 |
+
" target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))\n",
|
| 143 |
+
" \n",
|
| 144 |
+
" for article_batch, target_batch in tqdm(\n",
|
| 145 |
+
" zip(article_batches, target_batches), total=len(article_batches)):\n",
|
| 146 |
+
" \n",
|
| 147 |
+
" inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n",
|
| 148 |
+
" padding=\"max_length\", return_tensors=\"pt\")\n",
|
| 149 |
+
" \n",
|
| 150 |
+
" summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
|
| 151 |
+
" attention_mask=inputs[\"attention_mask\"].to(device), \n",
|
| 152 |
+
" length_penalty=0.8, num_beams=8, max_length=128)\n",
|
| 153 |
+
" \n",
|
| 154 |
+
" ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n",
|
| 155 |
+
" \n",
|
| 156 |
+
" # Finally, we decode the generated texts, \n",
|
| 157 |
+
" # replace the token, and add the decoded texts with the references to the metric.\n",
|
| 158 |
+
" decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
|
| 159 |
+
" clean_up_tokenization_spaces=True) for s in summaries] \n",
|
| 160 |
+
" \n",
|
| 161 |
+
" decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
|
| 162 |
+
" \n",
|
| 163 |
+
" \n",
|
| 164 |
+
" metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
|
| 165 |
+
" \n",
|
| 166 |
+
" # Finally compute and return the ROUGE scores.\n",
|
| 167 |
+
" score = metric.compute()\n",
|
| 168 |
+
" return score\n",
|
| 169 |
+
" \n",
|
| 170 |
+
" def evaluation(self):\n",
|
| 171 |
+
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 172 |
+
" tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
|
| 173 |
+
" model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
|
| 174 |
+
"\n",
|
| 175 |
+
" # loading data\n",
|
| 176 |
+
" dataset_pt = load_from_disk(self.config.data_path)\n",
|
| 177 |
+
"\n",
|
| 178 |
+
" rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
|
| 179 |
+
" \n",
|
| 180 |
+
" rouge_metric = load_metric('rouge')\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')\n",
|
| 183 |
+
" rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n",
|
| 184 |
+
" df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n",
|
| 185 |
+
" df.to_csv(self.config.metric_file_name, index=False)"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"cell_type": "code",
|
| 190 |
+
"execution_count": 12,
|
| 191 |
+
"metadata": {},
|
| 192 |
+
"outputs": [
|
| 193 |
+
{
|
| 194 |
+
"name": "stdout",
|
| 195 |
+
"output_type": "stream",
|
| 196 |
+
"text": [
|
| 197 |
+
"[2024-05-21 08:43:47,280: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
|
| 198 |
+
"[2024-05-21 08:43:47,284: INFO: common: Yaml file: params.yaml loaded successfully]\n",
|
| 199 |
+
"[2024-05-21 08:43:47,285: INFO: common: Directory created successfully at: artifacts]\n",
|
| 200 |
+
"[2024-05-21 08:43:47,286: INFO: common: Directory created successfully at: artifacts/model_evaluation]\n"
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"ename": "OSError",
|
| 205 |
+
"evalue": "Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub.",
|
| 206 |
+
"output_type": "error",
|
| 207 |
+
"traceback": [
|
| 208 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 209 |
+
"\u001b[1;31mHFValidationError\u001b[0m Traceback (most recent call last)",
|
| 210 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:398\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 396\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 397\u001b[0m \u001b[38;5;66;03m# Load from URL or cache if already cached\u001b[39;00m\n\u001b[1;32m--> 398\u001b[0m resolved_file \u001b[38;5;241m=\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 402\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 403\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 404\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 405\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 406\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GatedRepoError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
| 211 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:106\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrepo_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m--> 106\u001b[0m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arg_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m arg_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
| 212 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:154\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[1;34m(repo_id)\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m repo_id\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 154\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[0;32m 155\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepo id must be in the form \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrepo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnamespace/repo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Use `repo_type` argument if needed.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 157\u001b[0m )\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX\u001b[38;5;241m.\u001b[39mmatch(repo_id):\n",
|
| 213 |
+
"\u001b[1;31mHFValidationError\u001b[0m: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'artifacts/model_trainer/tokenizer'. Use `repo_type` argument if needed.",
|
| 214 |
+
"\nThe above exception was the direct cause of the following exception:\n",
|
| 215 |
+
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
|
| 216 |
+
"Cell \u001b[1;32mIn[12], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m model_evaluation_config\u001b[38;5;241m.\u001b[39mevaluation()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
|
| 217 |
+
"Cell \u001b[1;32mIn[12], line 5\u001b[0m\n\u001b[0;32m 3\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_model_evaluation_config()\n\u001b[0;32m 4\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m ModelEvaluation(config\u001b[38;5;241m=\u001b[39mmodel_evaluation_config)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mmodel_evaluation_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
|
| 218 |
+
"Cell \u001b[1;32mIn[10], line 48\u001b[0m, in \u001b[0;36mModelEvaluation.evaluation\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mevaluation\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 47\u001b[0m device \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mis_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m---> 48\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 49\u001b[0m model_pegasus \u001b[38;5;241m=\u001b[39m AutoModelForSeq2SeqLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mmodel_path)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m# loading data\u001b[39;00m\n",
|
| 219 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:804\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[1;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[0;32m 801\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 803\u001b[0m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n\u001b[1;32m--> 804\u001b[0m tokenizer_config \u001b[38;5;241m=\u001b[39m \u001b[43mget_tokenizer_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m tokenizer_config:\n\u001b[0;32m 806\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m tokenizer_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
| 220 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:637\u001b[0m, in \u001b[0;36mget_tokenizer_config\u001b[1;34m(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)\u001b[0m\n\u001b[0;32m 634\u001b[0m token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[0;32m 636\u001b[0m commit_hash \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m--> 637\u001b[0m resolved_config_file \u001b[38;5;241m=\u001b[39m \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[43m \u001b[49m\u001b[43mTOKENIZER_CONFIG_FILE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 640\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 641\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 642\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 643\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 644\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 645\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_gated_repo\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_missing_entries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_connection_errors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 652\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 654\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not locate the tokenizer configuration file, will try to use the model config 
instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
| 221 |
+
"File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:462\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 460\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a specific connection error when trying to load \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HFValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[0;32m 463\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncorrect path_or_model_id: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Please provide either the path to a local folder or the repo_id of a model on the Hub.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 464\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resolved_file\n",
|
| 222 |
+
"\u001b[1;31mOSError\u001b[0m: Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
|
| 223 |
+
]
|
| 224 |
+
}
|
| 225 |
+
],
|
| 226 |
+
"source": [
|
| 227 |
+
"try:\n",
|
| 228 |
+
" config = ConfigurationManager()\n",
|
| 229 |
+
" model_evaluation_config = config.get_model_evaluation_config()\n",
|
| 230 |
+
" model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
|
| 231 |
+
" model_evaluation_config.evaluation()\n",
|
| 232 |
+
"except Exception as e:\n",
|
| 233 |
+
" raise e"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": null,
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"outputs": [],
|
| 241 |
+
"source": []
|
| 242 |
+
}
|
| 243 |
+
],
|
| 244 |
+
"metadata": {
|
| 245 |
+
"kernelspec": {
|
| 246 |
+
"display_name": "env",
|
| 247 |
+
"language": "python",
|
| 248 |
+
"name": "python3"
|
| 249 |
+
},
|
| 250 |
+
"language_info": {
|
| 251 |
+
"codemirror_mode": {
|
| 252 |
+
"name": "ipython",
|
| 253 |
+
"version": 3
|
| 254 |
+
},
|
| 255 |
+
"file_extension": ".py",
|
| 256 |
+
"mimetype": "text/x-python",
|
| 257 |
+
"name": "python",
|
| 258 |
+
"nbconvert_exporter": "python",
|
| 259 |
+
"pygments_lexer": "ipython3",
|
| 260 |
+
"version": "3.12.2"
|
| 261 |
+
}
|
| 262 |
+
},
|
| 263 |
+
"nbformat": 4,
|
| 264 |
+
"nbformat_minor": 2
|
| 265 |
+
}
|
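The OSError in the evaluation cell above occurs because AutoTokenizer.from_pretrained receives the local path artifacts/model_trainer/tokenizer, that directory does not exist yet (the trainer stage has to save it first), and transformers then falls back to treating the string as a Hub repo id, which fails validation. A hedged sketch of a guard that surfaces the real cause before evaluation starts (load_local_tokenizer is a hypothetical helper, not part of the pipeline):

import os
from transformers import AutoTokenizer

def load_local_tokenizer(path: str):
    # A missing local path is otherwise interpreted as a Hub repo id,
    # which produces the confusing HFValidationError shown above.
    if not os.path.isdir(path):
        raise FileNotFoundError(
            f"Tokenizer directory '{path}' not found; run the model trainer stage first."
        )
    return AutoTokenizer.from_pretrained(path)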
research/model_trainer.ipynb
ADDED
|
@@ -0,0 +1,239 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import os"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 2,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"os.chdir('../')"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 3,
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [
|
| 26 |
+
{
|
| 27 |
+
"data": {
|
| 28 |
+
"text/plain": [
|
| 29 |
+
"'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
"execution_count": 3,
|
| 33 |
+
"metadata": {},
|
| 34 |
+
"output_type": "execute_result"
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"source": [
|
| 38 |
+
"%pwd"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": 5,
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": [
|
| 47 |
+
"from dataclasses import dataclass\n",
|
| 48 |
+
"from pathlib import Path\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"@dataclass(frozen=True)\n",
|
| 51 |
+
"class ModelTrainerConfig:\n",
|
| 52 |
+
" root_dir: Path\n",
|
| 53 |
+
" data_path: Path\n",
|
| 54 |
+
" model_ckpt: Path\n",
|
| 55 |
+
" num_train_epochs: int\n",
|
| 56 |
+
" warmup_steps: int\n",
|
| 57 |
+
" per_device_train_batch_size: int\n",
|
| 58 |
+
" weight_decay: float\n",
|
| 59 |
+
" logging_steps: int\n",
|
| 60 |
+
" evaluation_strategy: str\n",
|
| 61 |
+
" eval_steps: int\n",
|
| 62 |
+
" save_steps: float\n",
|
| 63 |
+
" gradient_accumulation_steps: int"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"cell_type": "code",
|
| 68 |
+
"execution_count": 6,
|
| 69 |
+
"metadata": {},
|
| 70 |
+
"outputs": [],
|
| 71 |
+
"source": [
|
| 72 |
+
"from summarylm.constants import *\n",
|
| 73 |
+
"from summarylm.utils.common import read_yaml, create_directories"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "code",
|
| 78 |
+
"execution_count": 9,
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"outputs": [],
|
| 81 |
+
"source": [
|
| 82 |
+
"class ConfigurationManager:\n",
|
| 83 |
+
" def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
|
| 84 |
+
" self.config = read_yaml(config_filepath)\n",
|
| 85 |
+
" self.params = read_yaml(params_filepath)\n",
|
| 86 |
+
"\n",
|
| 87 |
+
" create_directories([self.config.artifacts_root])\n",
|
| 88 |
+
"\n",
|
| 89 |
+
" def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
|
| 90 |
+
" config = self.config.model_trainer\n",
|
| 91 |
+
" params = self.params.TrainingArguments\n",
|
| 92 |
+
"\n",
|
| 93 |
+
" create_directories([config.root_dir])\n",
|
| 94 |
+
"\n",
|
| 95 |
+
" model_trainer_config = ModelTrainerConfig(\n",
|
| 96 |
+
" root_dir=config.root_dir,\n",
|
| 97 |
+
" data_path=config.data_path,\n",
|
| 98 |
+
" model_ckpt=config.model_ckpt,\n",
|
| 99 |
+
" num_train_epochs=params.num_train_epochs,\n",
|
| 100 |
+
" warmup_steps=params.warmup_steps,\n",
|
| 101 |
+
" per_device_train_batch_size=params.per_device_train_batch_size,\n",
|
| 102 |
+
" weight_decay=params.weight_decay,\n",
|
| 103 |
+
" logging_steps=params.logging_steps,\n",
|
| 104 |
+
" evaluation_strategy=params.evaluation_strategy,\n",
|
| 105 |
+
" eval_steps=params.eval_steps,\n",
|
| 106 |
+
" save_steps=params.save_steps,\n",
|
| 107 |
+
" gradient_accumulation_steps=params.gradient_accumulation_steps,\n",
|
| 108 |
+
" )\n",
|
| 109 |
+
"\n",
|
| 110 |
+
" return model_trainer_config"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": 11,
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"outputs": [],
|
| 118 |
+
"source": [
|
| 119 |
+
"from transformers import TrainingArguments, Trainer\n",
|
| 120 |
+
"from transformers import DataCollatorForSeq2Seq\n",
|
| 121 |
+
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
|
| 122 |
+
"from datasets import load_dataset, load_from_disk\n",
|
| 123 |
+
"import torch"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"cell_type": "code",
|
| 128 |
+
"execution_count": 12,
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"outputs": [],
|
| 131 |
+
"source": [
|
| 132 |
+
"class ModelTrainer:\n",
|
| 133 |
+
" def __init__(self, config: ModelTrainerConfig):\n",
|
| 134 |
+
" self.config = config\n",
|
| 135 |
+
"\n",
|
| 136 |
+
" def train(self):\n",
|
| 137 |
+
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 138 |
+
" tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
|
| 139 |
+
" model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
|
| 140 |
+
" seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
|
| 141 |
+
"\n",
|
| 142 |
+
" # loading the dataset\n",
|
| 143 |
+
" dataset_pt = load_from_disk(self.config.data_path)\n",
|
| 144 |
+
"\n",
|
| 145 |
+
" trainer_args = TrainingArguments(\n",
|
| 146 |
+
" output_dir=self.config.root_dir,\n",
|
| 147 |
+
" num_train_epochs=self.config.num_train_epochs,\n",
|
| 148 |
+
" warmup_steps=self.config.warmup_steps,\n",
|
| 149 |
+
" per_device_train_batch_size=self.config.per_device_train_batch_size,\n",
|
| 150 |
+
" per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
|
| 151 |
+
" weight_decay=self.config.weight_decay,\n",
|
| 152 |
+
" logging_steps=self.config.logging_steps,\n",
|
| 153 |
+
" evaluation_strategy=self.config.evaluation_strategy,\n",
|
| 154 |
+
" eval_steps=self.config.eval_steps,\n",
|
| 155 |
+
" save_steps=self.config.save_steps,\n",
|
| 156 |
+
" gradient_accumulation_steps=self.config.gradient_accumulation_steps,\n",
|
| 157 |
+
" )\n",
|
| 158 |
+
"\n",
|
| 159 |
+
" trainer = Trainer(\n",
|
| 160 |
+
" model=model_pegasus,\n",
|
| 161 |
+
" args=trainer_args,\n",
|
| 162 |
+
" tokenizer=tokenizer, \n",
|
| 163 |
+
" data_collator=seq2seq_data_collator,\n",
|
| 164 |
+
" train_dataset=dataset_pt['train'],\n",
|
| 165 |
+
" eval_dataset=dataset_pt['validation']\n",
|
| 166 |
+
" )\n",
|
| 167 |
+
"\n",
|
| 168 |
+
" trainer.train()\n",
|
| 169 |
+
"\n",
|
| 170 |
+
" ## Save model\n",
|
| 171 |
+
" model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-summary-lm\"))\n",
|
| 172 |
+
" ## Save tokenizer\n",
|
| 173 |
+
" tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"cell_type": "code",
|
| 178 |
+
"execution_count": 13,
|
| 179 |
+
"metadata": {},
|
| 180 |
+
"outputs": [
|
| 181 |
+
{
|
| 182 |
+
"name": "stdout",
|
| 183 |
+
"output_type": "stream",
|
| 184 |
+
"text": [
|
| 185 |
+
"[2024-05-21 07:37:38,704: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
|
| 186 |
+
"[2024-05-21 07:37:38,721: INFO: common: Yaml file: params.yaml loaded successfully]\n",
|
| 187 |
+
"[2024-05-21 07:37:38,739: INFO: common: Directory created successfully at: artifacts]\n",
|
| 188 |
+
"[2024-05-21 07:37:38,742: INFO: common: Directory created successfully at: artifacts/model_trainer]\n"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"name": "stderr",
|
| 193 |
+
"output_type": "stream",
|
| 194 |
+
"text": [
|
| 195 |
+
"d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 196 |
+
" warnings.warn(\n"
|
| 197 |
+
]
|
| 198 |
+
}
|
| 199 |
+
],
|
| 200 |
+
"source": [
|
| 201 |
+
"try:\n",
|
| 202 |
+
" config = ConfigurationManager()\n",
|
| 203 |
+
" model_trainer_config = config.get_model_trainer_config()\n",
|
| 204 |
+
" model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
|
| 205 |
+
" model_trainer_config.train()\n",
|
| 206 |
+
"except Exception as e:\n",
|
| 207 |
+
" raise e"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"cell_type": "code",
|
| 212 |
+
"execution_count": null,
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"outputs": [],
|
| 215 |
+
"source": []
|
| 216 |
+
}
|
| 217 |
+
],
|
| 218 |
+
"metadata": {
|
| 219 |
+
"kernelspec": {
|
| 220 |
+
"display_name": "env",
|
| 221 |
+
"language": "python",
|
| 222 |
+
"name": "python3"
|
| 223 |
+
},
|
| 224 |
+
"language_info": {
|
| 225 |
+
"codemirror_mode": {
|
| 226 |
+
"name": "ipython",
|
| 227 |
+
"version": 3
|
| 228 |
+
},
|
| 229 |
+
"file_extension": ".py",
|
| 230 |
+
"mimetype": "text/x-python",
|
| 231 |
+
"name": "python",
|
| 232 |
+
"nbconvert_exporter": "python",
|
| 233 |
+
"pygments_lexer": "ipython3",
|
| 234 |
+
"version": "3.12.2"
|
| 235 |
+
}
|
| 236 |
+
},
|
| 237 |
+
"nbformat": 4,
|
| 238 |
+
"nbformat_minor": 2
|
| 239 |
+
}
|
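Fine-tuning Pegasus on the full concatenated dataset is expensive on a local machine, so a quick smoke test on a small slice is a common first step. A sketch under the assumption that the objects built inside ModelTrainer.train() (tokenizer, model_pegasus, seq2seq_data_collator, dataset_pt) are available in the notebook scope; the subset sizes and step counts below are illustrative, not values from params.yaml:

small_train = dataset_pt["train"].select(range(64))
small_eval = dataset_pt["validation"].select(range(16))

smoke_args = TrainingArguments(
    output_dir="artifacts/model_trainer/smoke_test",
    max_steps=10,                       # stop after a handful of optimizer updates
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    logging_steps=5,
)

smoke_trainer = Trainer(
    model=model_pegasus,
    args=smoke_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=small_train,
    eval_dataset=small_eval,
)
smoke_trainer.train()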
setup.py
ADDED
|
@@ -0,0 +1,22 @@
|
| 1 |
+
import setuptools
|
| 2 |
+
|
| 3 |
+
with open("README.md", "r", encoding="utf-8") as f:
|
| 4 |
+
long_description = f.read()
|
| 5 |
+
|
| 6 |
+
__version__ = "0.0.0"
|
| 7 |
+
|
| 8 |
+
AUTHOR_USER_NAME = "Satyam Mishra"
|
| 9 |
+
SRC_REPO = "SummaryLM"
|
| 10 |
+
AUTHOR_EMAIL = "[email protected]"
|
| 11 |
+
|
| 12 |
+
setuptools.setup(
|
| 13 |
+
name=SRC_REPO,
|
| 14 |
+
version=__version__,
|
| 15 |
+
author=AUTHOR_USER_NAME,
|
| 16 |
+
author_email=AUTHOR_EMAIL,
|
| 17 |
+
description='A text summarizer',
|
| 18 |
+
long_description=long_description,
|
| 19 |
+
long_description_content_type="text/markdown",
|
| 20 |
+
package_dir={"": "src"},
|
| 21 |
+
packages=setuptools.find_packages(where="src"),
|
| 22 |
+
)
|
src/summarylm/__init__.py
ADDED
|
File without changes
|
src/summarylm/components/__init__.py
ADDED
|
File without changes
|
src/summarylm/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,39 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import zipfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
|
| 7 |
+
from summarylm.entity import DataIngestionConfig
|
| 8 |
+
from summarylm.utils.common import get_size
|
| 9 |
+
from summarylm.logging import logger
|
| 10 |
+
from summarylm.exception import CustomException
|
| 11 |
+
|
| 12 |
+
class DataIngestion:
|
| 13 |
+
"""
|
| 14 |
+
Class for downloading the datasets and storing them in the artifacts folder
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
config (DataIngestionConfig): Contains all configuration for data ingestion
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
None
|
| 21 |
+
"""
|
| 22 |
+
def __init__(self, config: DataIngestionConfig):
|
| 23 |
+
self.config = config
|
| 24 |
+
|
| 25 |
+
def download_data(self):
|
| 26 |
+
"""
|
| 27 |
+
Function to download the datasets from the Hugging Face Hub and save them to disk
|
| 28 |
+
"""
|
| 29 |
+
try:
|
| 30 |
+
for i in range(len(self.config.LOCAL_DATA_FILE)):
|
| 31 |
+
if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):
|
| 32 |
+
dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])
|
| 33 |
+
dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])
|
| 34 |
+
logger.info(f"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!")
|
| 35 |
+
else:
|
| 36 |
+
logger.info(f"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}")
|
| 37 |
+
|
| 38 |
+
except Exception as e:
|
| 39 |
+
raise CustomException(e, sys) from e
|
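download_data() pulls each configured dataset from the Hugging Face Hub with datasets.load_dataset and serializes it with save_to_disk; the zipfile import above is not used on this path. A stand-alone sketch of that step for a single dataset (the dataset id and target directory are illustrative):

from datasets import load_dataset, load_from_disk

# Download once, then reuse the on-disk copy on later runs.
dataset = load_dataset("cnn_dailymail", "3.0.0")   # illustrative dataset id
dataset.save_to_disk("artifacts/data_ingestion/example_dataset")

reloaded = load_from_disk("artifacts/data_ingestion/example_dataset")
print(reloaded)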
src/summarylm/components/data_transformation.py
ADDED
|
@@ -0,0 +1,107 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from summarylm.logging import logger
|
| 4 |
+
from summarylm.exception import CustomException
|
| 5 |
+
from summarylm.entity import DataTransformationConfig
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DataTransformation:
|
| 11 |
+
"""
|
| 12 |
+
Class for transforming the datasets into a valid format for training
|
| 13 |
+
|
| 14 |
+
Args:
|
| 15 |
+
config (DataTransformationConfig): Contains all configuration for data transformation
|
| 16 |
+
"""
|
| 17 |
+
def __init__(self, config: DataTransformationConfig):
|
| 18 |
+
self.config = config
|
| 19 |
+
self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
|
| 20 |
+
|
| 21 |
+
def convert_data_into_right_format(self, datasets: list) -> DatasetDict:
|
| 22 |
+
"""
|
| 23 |
+
Function to remove and rename columns and convert the datasets into the right format for training
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
datasets (list): list of all dataset path
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
DatasetDict: Contains train, test, and validation sets
|
| 30 |
+
"""
|
| 31 |
+
try:
|
| 32 |
+
logger.info("Entered convert_data_into_right_format method of DataTransformation class.")
|
| 33 |
+
# loading all datasets
|
| 34 |
+
loaded_datasets = {}
|
| 35 |
+
print("Loading the dataset")
|
| 36 |
+
for data in datasets:
|
| 37 |
+
loaded_datasets[data] = load_from_disk(data)
|
| 38 |
+
|
| 39 |
+
dataset1 = loaded_datasets[datasets[0]]
|
| 40 |
+
dataset2 = loaded_datasets[datasets[1]]
|
| 41 |
+
print("Dataset loaded")
|
| 42 |
+
|
| 43 |
+
# removing unwanted columns from dataset1
|
| 44 |
+
dataset1_train = dataset1['train'].select_columns(['article', 'summary'])
|
| 45 |
+
dataset1_test = dataset1['test'].select_columns(['article', 'summary'])
|
| 46 |
+
dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])
|
| 47 |
+
|
| 48 |
+
# renaming data column name of dataset1
|
| 49 |
+
dataset1_train = dataset1_train.rename_column('article', 'text')
|
| 50 |
+
dataset1_test = dataset1_test.rename_column('article', 'text')
|
| 51 |
+
dataset1_validation = dataset1_validation.rename_column('article', 'text')
|
| 52 |
+
|
| 53 |
+
# renaming data column name of dataset2
|
| 54 |
+
dataset2_train = dataset2['train'].rename_column('document', 'text')
|
| 55 |
+
dataset2_test = dataset2['test'].rename_column('document', 'text')
|
| 56 |
+
dataset2_validation = dataset2['validation'].rename_column('document', 'text')
|
| 57 |
+
|
| 58 |
+
# concatenate_datasets
|
| 59 |
+
dataset_train = concatenate_datasets([dataset1_train, dataset2_train])
|
| 60 |
+
dataset_test = concatenate_datasets([dataset1_test, dataset2_test])
|
| 61 |
+
dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])
|
| 62 |
+
|
| 63 |
+
# loading the dataset into DatasetDict
|
| 64 |
+
dataset = DatasetDict({
|
| 65 |
+
"train": dataset_train,
|
| 66 |
+
"validation": dataset_validation,
|
| 67 |
+
"test": dataset_test,
|
| 68 |
+
})
|
| 69 |
+
return dataset
|
| 70 |
+
|
| 71 |
+
except Exception as e:
|
| 72 |
+
raise CustomException(e, sys) from e
|
| 73 |
+
|
| 74 |
+
def convert_examples_to_features(self, example_batch):
|
| 75 |
+
"""
|
| 76 |
+
Method to convert a batch of examples into tokenized features
|
| 77 |
+
|
| 78 |
+
Args:
|
| 79 |
+
example_batch: a batch of examples from the loaded dataset
|
| 80 |
+
Returns:
|
| 81 |
+
input_ids: A list of token ids representing the dialogue
|
| 82 |
+
attention_mask: List of indices specifying which tokens should be attended to by the model
|
| 83 |
+
labels: A list of token ids representing the summary
|
| 84 |
+
"""
|
| 85 |
+
try:
|
| 86 |
+
logger.info("Entered convert_examples_to_features method of DataTransformation class.")
|
| 87 |
+
input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)
|
| 88 |
+
|
| 89 |
+
with self.tokenizer.as_target_tokenizer():
|
| 90 |
+
target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)
|
| 91 |
+
|
| 92 |
+
return {
|
| 93 |
+
'input_ids': input_encodings['input_ids'],
|
| 94 |
+
'attention_mask': input_encodings['attention_mask'],
|
| 95 |
+
'labels': target_encodings['input_ids']
|
| 96 |
+
}
|
| 97 |
+
except Exception as e:
|
| 98 |
+
raise CustomException(e, sys) from e
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def convert(self):
|
| 102 |
+
data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])
|
| 103 |
+
data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])
|
| 104 |
+
|
| 105 |
+
dataset = self.convert_data_into_right_format([data1, data2])
|
| 106 |
+
dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)
|
| 107 |
+
dataset_pt.save_to_disk(os.path.join(self.config.root_dir, "dataset"))
|
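convert_examples_to_features() tokenizes the source text and, inside as_target_tokenizer(), the summary, so each mapped example ends up with input_ids, attention_mask, and labels. A small illustration of the same tokenization on one made-up example (the checkpoint name is illustrative; the pipeline takes it from config.tokenizer_name):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")  # illustrative checkpoint

example = {"text": "The quick brown fox jumped over the lazy dog near the river bank.",
           "summary": "A fox jumped over a dog."}

input_encodings = tokenizer(example["text"], max_length=1024, truncation=True)
with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(example["summary"], max_length=128, truncation=True)

features = {
    "input_ids": input_encodings["input_ids"],
    "attention_mask": input_encodings["attention_mask"],
    "labels": target_encodings["input_ids"],
}
print({key: len(value) for key, value in features.items()})  # token counts per field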
src/summarylm/components/data_validation.py
ADDED
|
@@ -0,0 +1,45 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from summarylm.logging import logger
|
| 4 |
+
from summarylm.exception import CustomException
|
| 5 |
+
from summarylm.entity import DataValidationConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataValidation:
|
| 9 |
+
"""
|
| 10 |
+
Class for validating that all required data files exist in the train, test, and validation folders
|
| 11 |
+
|
| 12 |
+
Args:
|
| 13 |
+
config (DataValidationConfig): Contains all configuration for data validation
|
| 14 |
+
|
| 15 |
+
Returns:
|
| 16 |
+
validation_status (bool): True if all required files exist, else False
|
| 17 |
+
"""
|
| 18 |
+
def __init__(self, config: DataValidationConfig):
|
| 19 |
+
self.config = config
|
| 20 |
+
|
| 21 |
+
def validate_all_files_exist(self) -> bool:
|
| 22 |
+
try:
|
| 23 |
+
logger.info("Entered validate_all_files_exist method of DataValidation class.")
|
| 24 |
+
validation_status = None
|
| 25 |
+
|
| 26 |
+
for data in self.config.ALL_REQUIRED_DATA:
|
| 27 |
+
all_files = os.listdir(os.path.join("artifacts", "data_ingestion", data))
|
| 28 |
+
|
| 29 |
+
for file in all_files:
|
| 30 |
+
if file not in self.config.ALL_REQUIRED_FILES:
|
| 31 |
+
validation_status = False
|
| 32 |
+
|
| 33 |
+
with open(self.config.STATUS_FILE, 'w') as f:
|
| 34 |
+
f.write(f"Validation status: {validation_status}")
|
| 35 |
+
else:
|
| 36 |
+
validation_status = True
|
| 37 |
+
|
| 38 |
+
with open(self.config.STATUS_FILE, 'w') as f:
|
| 39 |
+
f.write(f"Validation status: {validation_status}")
|
| 40 |
+
|
| 41 |
+
logger.info("Completed validate_all_files_exist method of DataValidation class.")
|
| 42 |
+
|
| 43 |
+
return validation_status
|
| 44 |
+
except Exception as e:
|
| 45 |
+
raise CustomException(e, sys) from e
|
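Note that validate_all_files_exist() overwrites validation_status for every file it inspects, so an expected file encountered after an unexpected one flips the status back to True, and a required file that is simply missing is never flagged. A hedged sketch of a stricter check built on the same config fields (not a drop-in replacement for the class above):

import os

def strict_validation(config) -> bool:
    # Require every expected file in every dataset folder and write the status once.
    status = True
    for data in config.ALL_REQUIRED_DATA:
        folder = os.path.join("artifacts", "data_ingestion", data)
        missing = set(config.ALL_REQUIRED_FILES) - set(os.listdir(folder))
        if missing:
            status = False
    with open(config.STATUS_FILE, "w") as f:
        f.write(f"Validation status: {status}")
    return status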
src/summarylm/components/model_evaluation.py
ADDED
|
@@ -0,0 +1,69 @@
|
| 1 |
+
from summarylm.entity import ModelEvaluationConfig
|
| 2 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 3 |
+
from datasets import load_dataset, load_from_disk, load_metric
|
| 4 |
+
import torch
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ModelEvaluation:
|
| 10 |
+
def __init__(self, config: ModelEvaluationConfig):
|
| 11 |
+
self.config = config
|
| 12 |
+
|
| 13 |
+
def generate_batch_size_chunks(self, list_of_elements, batch_size):
|
| 14 |
+
"""
|
| 15 |
+
Split the dataset into smaller batches that we can process simultaneously
|
| 16 |
+
Yield successive batch-sized chunks from list_of_elements.
|
| 17 |
+
"""
|
| 18 |
+
for i in range(0, len(list_of_elements), batch_size):
|
| 19 |
+
yield list_of_elements[i : i + batch_size]
|
| 20 |
+
|
| 21 |
+
def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,
|
| 22 |
+
device="cuda" if torch.cuda.is_available() else "cpu",
|
| 23 |
+
column_text="article",
|
| 24 |
+
column_summary="highlights"):
|
| 25 |
+
article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))
|
| 26 |
+
target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))
|
| 27 |
+
|
| 28 |
+
for article_batch, target_batch in tqdm(
|
| 29 |
+
zip(article_batches, target_batches), total=len(article_batches)):
|
| 30 |
+
|
| 31 |
+
inputs = tokenizer(article_batch, max_length=1024, truncation=True,
|
| 32 |
+
padding="max_length", return_tensors="pt")
|
| 33 |
+
|
| 34 |
+
summaries = model.generate(input_ids=inputs["input_ids"].to(device),
|
| 35 |
+
attention_mask=inputs["attention_mask"].to(device),
|
| 36 |
+
length_penalty=0.8, num_beams=8, max_length=128)
|
| 37 |
+
|
| 38 |
+
# The length_penalty parameter ensures that the model does not generate sequences that are too long.
|
| 39 |
+
|
| 40 |
+
# Finally, we decode the generated texts,
|
| 41 |
+
# replace the token, and add the decoded texts with the references to the metric.
|
| 42 |
+
decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
|
| 43 |
+
clean_up_tokenization_spaces=True) for s in summaries]
|
| 44 |
+
|
| 45 |
+
decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
metric.add_batch(predictions=decoded_summaries, references=target_batch)
|
| 49 |
+
|
| 50 |
+
# Finally compute and return the ROUGE scores.
|
| 51 |
+
score = metric.compute()
|
| 52 |
+
return score
|
| 53 |
+
|
| 54 |
+
def evaluation(self):
|
| 55 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 56 |
+
tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
|
| 57 |
+
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
|
| 58 |
+
|
| 59 |
+
# loading data
|
| 60 |
+
dataset_pt = load_from_disk(self.config.data_path)
|
| 61 |
+
|
| 62 |
+
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
|
| 63 |
+
|
| 64 |
+
rouge_metric = load_metric('rouge')
|
| 65 |
+
|
| 66 |
+
score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')
|
| 67 |
+
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
|
| 68 |
+
df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
|
| 69 |
+
df.to_csv(self.config.metric_file_name, index=False)
|
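generate_batch_size_chunks drives the evaluation loop by slicing the text and reference columns into equally sized batches. A standalone sketch of that slicing behaviour (illustration only, not part of the repository):

# Standalone re-implementation of the chunking logic, for illustration only.
def chunks(items, batch_size):
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]

print(list(chunks(["a", "b", "c", "d", "e"], 2)))
# [['a', 'b'], ['c', 'd'], ['e']] -- the last batch may be smaller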
src/summarylm/components/model_trainer.py
ADDED
@@ -0,0 +1,69 @@
import os
import sys
from summarylm.entity import ModelTrainerConfig
from summarylm.logging import logger
from summarylm.exception import CustomException
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch


class ModelTrainer:
    """
    Class for training the model.

    Args:
        config (ModelTrainerConfig): Contains all configuration for model training.
    """
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        """
        Method to train the pegasus model.
        """
        logger.info("Entered train method of ModelTrainer class.")
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
            model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
            seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

            # loading the dataset
            dataset_pt = load_from_disk(self.config.data_path)

            trainer_args = TrainingArguments(
                output_dir=self.config.root_dir,
                num_train_epochs=self.config.num_train_epochs,
                warmup_steps=self.config.warmup_steps,
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                per_device_eval_batch_size=self.config.per_device_train_batch_size,
                weight_decay=self.config.weight_decay,
                logging_steps=self.config.logging_steps,
                evaluation_strategy=self.config.evaluation_strategy,
                eval_steps=self.config.eval_steps,
                save_steps=self.config.save_steps,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            )

            trainer = Trainer(
                model=model_pegasus,
                args=trainer_args,
                tokenizer=tokenizer,
                data_collator=seq2seq_data_collator,
                train_dataset=dataset_pt['train'],
                eval_dataset=dataset_pt['validation']
            )

            trainer.train()

            ## Save model
            model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-summary-lm"))
            ## Save tokenizer
            tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

            logger.info("Completed train method of ModelTrainer class.")
        except Exception as e:
            raise CustomException(e, sys) from e
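Because per_device_train_batch_size is combined with gradient_accumulation_steps, the effective batch size seen by the optimiser is their product. A small illustration with hypothetical values (the actual numbers are read from params.yaml):

# Hypothetical values for illustration; the real ones come from params.yaml.
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch_size)  # 16 samples contribute to each optimiser step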
src/summarylm/config/__init__.py
ADDED
File without changes
src/summarylm/config/configuration.py
ADDED
@@ -0,0 +1,100 @@
from summarylm.constants import *
from summarylm.utils.common import read_yaml, create_directories
from summarylm.entity import (DataIngestionConfig, DataValidationConfig, DataTransformationConfig, ModelTrainerConfig, ModelEvaluationConfig)

class ConfigurationManager:
    """
    Configuration manager for data ingestion, data validation, data transformation,
    model training and model evaluation.

    Args:
        config_filepath (Path): Path to the config yaml file
        params_filepath (Path): Path to the params yaml file

    Returns:
        None
    """
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,
            LOCAL_DATA_FILE=config.LOCAL_DATA_FILE,
        )

        return data_ingestion_config

    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
        )

        return data_validation_config

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            tokenizer_path=config.tokenizer_path,
            metric_file_name=config.metric_file_name,
        )

        return model_evaluation_config
src/summarylm/config/gcloud_syncer.py
ADDED
@@ -0,0 +1,34 @@
import os

class GCloudSync:

    def sync_folder_to_gcloud(self, gcp_bucket_url, filepath, filename):

        """
        Function to sync files from the local machine to Google Cloud Storage.

        Args:
            gcp_bucket_url (str): Google Cloud Storage bucket URL
            filepath (str): Local file path
            filename (str): Local file name
        """

        command = f"gsutil cp {filepath}/{filename} gs://{gcp_bucket_url}"

        os.system(command)

    def sync_folder_from_gcloud(self, gcp_bucket_url, filename, destination):

        """
        Function to sync a folder from Google Cloud Storage to the local machine.

        Args:
            gcp_bucket_url (str): Google Cloud Storage bucket URL
            filename (str): Local file name
            destination (str): Local file path
        """

        command = f"gsutil cp gs://{gcp_bucket_url}/{filename} {destination}/{filename}"

        os.system(command)
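A hedged usage sketch of GCloudSync; the bucket name and paths below are placeholders, and gsutil must be installed and authenticated for the commands to succeed:

# Placeholder bucket and paths, for illustration only.
from summarylm.config.gcloud_syncer import GCloudSync

syncer = GCloudSync()
syncer.sync_folder_to_gcloud("my-bucket", "artifacts/model_trainer", "pegasus-summary-lm")
syncer.sync_folder_from_gcloud("my-bucket", "pegasus-summary-lm", "artifacts/model_trainer")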
src/summarylm/constants/__init__.py
ADDED
@@ -0,0 +1,4 @@
from pathlib import Path

CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
src/summarylm/entity/__init__.py
ADDED
@@ -0,0 +1,45 @@
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    ALL_HUGGINGFACE_DATA: list
    LOCAL_DATA_FILE: list

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    ALL_REQUIRED_DATA: list
    ALL_REQUIRED_FILES: list

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    ALL_REQUIRED_DATA: Path
    tokenizer_name: Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: Path
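The entities are frozen dataclasses, so a config object is immutable once ConfigurationManager has built it. A quick sketch with made-up values (the real values come from config/config.yaml):

# Made-up values for illustration only.
from pathlib import Path
from summarylm.entity import DataValidationConfig

cfg = DataValidationConfig(
    root_dir=Path("artifacts/data_validation"),
    STATUS_FILE="artifacts/data_validation/status.txt",
    ALL_REQUIRED_DATA=["my_dataset"],
    ALL_REQUIRED_FILES=["train", "test", "validation"],
)
# cfg.STATUS_FILE = "other.txt"  # would raise dataclasses.FrozenInstanceError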
src/summarylm/exception/__init__.py
ADDED
@@ -0,0 +1,34 @@
import os
import sys
from summarylm.logging import logger

def error_message_detail(error, error_detail):
    """
    Returns the error message with its details and logs the error.

    Args:
        error: error message
        error_detail: error details (typically the sys module, used to read exc_info())

    Returns:
        error_message: formatted error message
    """
    _, _, exe_tb = error_detail.exc_info()
    file_name = exe_tb.tb_frame.f_code.co_filename
    line_number = exe_tb.tb_lineno
    error_message = "Error occurred in file called [{0}] line number: [{1}] error message: [{2}]".format(
        file_name, line_number, str(error)
    )

    logger.info(error_message)

    return error_message

class CustomException(Exception):
    def __init__(self, error_message, error_detail):
        super().__init__(error_message)
        self.error_message = error_message_detail(error_message, error_detail=error_detail)

    def __str__(self):
        return self.error_message
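CustomException expects the sys module as error_detail so that error_message_detail can call sys.exc_info() and recover the file name and line number of the failure. A minimal sketch of the intended call pattern, mirroring how the pipeline classes use it:

import sys
from summarylm.exception import CustomException

try:
    1 / 0
except Exception as e:
    raise CustomException(e, sys) from e   # message includes file name and line number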
src/summarylm/logging/__init__.py
ADDED
@@ -0,0 +1,22 @@
import logging
import os
from datetime import datetime

"""
Logs every message to a log file inside the logs directory.
"""

LOG_FILE = "running_logs.log"
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)


LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

logger = logging.getLogger("textSummarizerLogger")
src/summarylm/pipeline/__init__.py
ADDED
File without changes
src/summarylm/pipeline/data_ingestion.py
ADDED
@@ -0,0 +1,22 @@
import sys
from summarylm.config.configuration import ConfigurationManager
from summarylm.components.data_ingestion import DataIngestion
from summarylm.logging import logger
from summarylm.exception import CustomException


class DataIngestionPipeline:
    """
    Pipeline for data ingestion
    """
    def __init__(self) -> None:
        pass

    def main(self):
        try:
            config = ConfigurationManager()
            data_ingestion_config = config.get_data_ingestion_config()
            data_ingestion = DataIngestion(config=data_ingestion_config)
            data_ingestion.download_data()
        except Exception as e:
            raise CustomException(e, sys) from e
src/summarylm/pipeline/data_transformation.py
ADDED
@@ -0,0 +1,22 @@
import sys
from summarylm.config.configuration import ConfigurationManager
from summarylm.components.data_transformation import DataTransformation
from summarylm.logging import logger
from summarylm.exception import CustomException


class DataTransformationPipeline:
    """
    Pipeline for data transformation to convert data into the right format
    """
    def __init__(self) -> None:
        pass

    def main(self):
        try:
            config = ConfigurationManager()
            data_transformation_config = config.get_data_transformation_config()
            data_transformation = DataTransformation(config=data_transformation_config)
            data_transformation.convert()
        except Exception as e:
            raise CustomException(e, sys) from e
src/summarylm/pipeline/data_validation.py
ADDED
@@ -0,0 +1,22 @@
import sys
from summarylm.config.configuration import ConfigurationManager
from summarylm.components.data_validation import DataValidation
from summarylm.logging import logger
from summarylm.exception import CustomException


class DataValidationPipeline:
    """
    Pipeline for validating that the ingested data exists
    """
    def __init__(self) -> None:
        pass

    def main(self):
        try:
            config = ConfigurationManager()
            data_validation_config = config.get_data_validation_config()
            data_validation = DataValidation(config=data_validation_config)
            data_validation.validate_all_files_exist()
        except Exception as e:
            raise CustomException(e, sys) from e
src/summarylm/pipeline/model_evaluation.py
ADDED
@@ -0,0 +1,22 @@
import sys
from summarylm.config.configuration import ConfigurationManager
from summarylm.components.model_evaluation import ModelEvaluation
from summarylm.logging import logger
from summarylm.exception import CustomException


class ModelEvaluationPipeline:
    """
    Pipeline for pegasus model evaluation
    """
    def __init__(self) -> None:
        pass

    def main(self):
        try:
            config = ConfigurationManager()
            model_evaluation_config = config.get_model_evaluation_config()
            model_evaluation = ModelEvaluation(config=model_evaluation_config)
            model_evaluation.evaluation()
        except Exception as e:
            raise CustomException(e, sys) from e
src/summarylm/pipeline/model_trainer.py
ADDED
@@ -0,0 +1,22 @@
import sys
from summarylm.config.configuration import ConfigurationManager
from summarylm.components.model_trainer import ModelTrainer
from summarylm.logging import logger
from summarylm.exception import CustomException


class ModelTrainerPipeline:
    """
    Pipeline for training the pegasus model
    """
    def __init__(self) -> None:
        pass

    def main(self):
        try:
            config = ConfigurationManager()
            model_trainer_config = config.get_model_trainer_config()
            model_trainer = ModelTrainer(config=model_trainer_config)
            model_trainer.train()
        except Exception as e:
            raise CustomException(e, sys) from e
src/summarylm/pipeline/prediction.py
ADDED
@@ -0,0 +1,17 @@
from summarylm.config.configuration import ConfigurationManager
from transformers import AutoTokenizer
from transformers import pipeline

class PredictionPipeline:
    def __init__(self):
        self.config = ConfigurationManager().get_model_evaluation_config()

    def predict(self, text, max_length):
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": max_length}

        pipe = pipeline("summarization", model=self.config.model_path, tokenizer=tokenizer)

        output = pipe(text, **gen_kwargs)[0]["summary_text"]

        return output
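A hedged usage sketch of PredictionPipeline; it assumes the model and tokenizer paths configured for model_evaluation already point at trained artifacts on disk:

# Assumes the model_evaluation config's model_path and tokenizer_path exist locally.
from summarylm.pipeline.prediction import PredictionPipeline

text = "Your long dialogue or article goes here ..."
summary = PredictionPipeline().predict(text, max_length=128)
print(summary)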
src/summarylm/utils/__init__.py
ADDED
File without changes
src/summarylm/utils/common.py
ADDED
@@ -0,0 +1,65 @@
import os
import sys
from box.exceptions import BoxValueError
from summarylm.exception import CustomException
import yaml
from summarylm.logging import logger
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any


@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """
    Reads a yaml file and returns its content.

    Args:
        path_to_yaml (Path): path to the yaml file

    Raises:
        ValueError: if the yaml file is empty

    Returns:
        ConfigBox: yaml content wrapped in a ConfigBox
    """

    try:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
            logger.info(f"Yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(content)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    except Exception as e:
        raise CustomException(e, sys) from e

@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """
    Create a list of directories.

    Args:
        path_to_directories (list): list of directory paths
        verbose (bool, optional): log each created directory. Defaults to True.
    """

    for path in path_to_directories:
        os.makedirs(path, exist_ok=True)
        if verbose:
            logger.info(f"Directory created successfully at: {path}")

@ensure_annotations
def get_size(path: Path) -> str:
    """
    Get size in KB.

    Args:
        path (Path): path of the file

    Returns:
        str: size in KB
    """
    size_in_kb = round(os.path.getsize(path)/1024)
    return f"~{size_in_kb} KB"
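read_yaml wraps the parsed yaml in a ConfigBox, which is why configuration.py can use attribute access such as config.data_ingestion.root_dir instead of dictionary lookups. A small standalone sketch of that behaviour (the keys here are made up):

# Standalone illustration of ConfigBox attribute access; keys are made up.
from box import ConfigBox

content = ConfigBox({"data_ingestion": {"root_dir": "artifacts/data_ingestion"}})
print(content.data_ingestion.root_dir)   # artifacts/data_ingestion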
template.py
ADDED
@@ -0,0 +1,53 @@
import os
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

project_name = "summarylm"

list_of_file = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/components/data_ingestion.py",
    f"src/{project_name}/components/data_transformation.py",
    f"src/{project_name}/components/data_validation.py",
    f"src/{project_name}/components/model_evaluation.py",
    f"src/{project_name}/components/model_trainer.py",
    f"src/{project_name}/utils/__init__.py",
    f"src/{project_name}/utils/common.py",
    f"src/{project_name}/logging/__init__.py",
    f"src/{project_name}/exception/__init__.py",
    f"src/{project_name}/config/__init__.py",
    f"src/{project_name}/config/configuration.py",
    f"src/{project_name}/config/gcloud_syncer.py",
    f"src/{project_name}/pipeline/__init__.py",
    f"src/{project_name}/pipeline/data_ingestion.py",
    f"src/{project_name}/entity/__init__.py",
    f"src/{project_name}/constants/__init__.py",
    "config/config.yaml",
    "params.yaml",
    "app.py",
    "main.py",
    "Dockerfile",
    "requirements.txt",
    "setup.py",
    "research/experiment.ipynb",
]

for filepath in list_of_file:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        logging.info(f"Creating directory: {filedir} for the file {filename}")


    if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
        with open(filepath, 'w') as f:
            pass
        logging.info(f"Creating empty file: {filepath}")
    else:
        logging.info(f"{filename} already exists")