satyam998 committed
Commit 95576a3 · 0 parents

Initial commit

Files changed (41)
  1. .github/workflows/main.yml +34 -0
  2. .gitignore +162 -0
  3. Dockerfile +17 -0
  4. LICENSE +201 -0
  5. README.md +11 -0
  6. app.py +36 -0
  7. config/config.yaml +34 -0
  8. main.py +66 -0
  9. params.yaml +10 -0
  10. requirements.txt +23 -0
  11. research/data_ingestion.ipynb +183 -0
  12. research/data_transformation.ipynb +234 -0
  13. research/data_validation.ipynb +197 -0
  14. research/experiment.ipynb +74 -0
  15. research/model_evaluation.ipynb +265 -0
  16. research/model_trainer.ipynb +239 -0
  17. setup.py +22 -0
  18. src/summarylm/__init__.py +0 -0
  19. src/summarylm/components/__init__.py +0 -0
  20. src/summarylm/components/data_ingestion.py +39 -0
  21. src/summarylm/components/data_transformation.py +107 -0
  22. src/summarylm/components/data_validation.py +45 -0
  23. src/summarylm/components/model_evaluation.py +69 -0
  24. src/summarylm/components/model_trainer.py +69 -0
  25. src/summarylm/config/__init__.py +0 -0
  26. src/summarylm/config/configuration.py +100 -0
  27. src/summarylm/config/gcloud_syncer.py +34 -0
  28. src/summarylm/constants/__init__.py +4 -0
  29. src/summarylm/entity/__init__.py +45 -0
  30. src/summarylm/exception/__init__.py +34 -0
  31. src/summarylm/logging/__init__.py +22 -0
  32. src/summarylm/pipeline/__init__.py +0 -0
  33. src/summarylm/pipeline/data_ingestion.py +22 -0
  34. src/summarylm/pipeline/data_transformation.py +22 -0
  35. src/summarylm/pipeline/data_validation.py +22 -0
  36. src/summarylm/pipeline/model_evaluation.py +22 -0
  37. src/summarylm/pipeline/model_trainer.py +22 -0
  38. src/summarylm/pipeline/prediction.py +17 -0
  39. src/summarylm/utils/__init__.py +0 -0
  40. src/summarylm/utils/common.py +65 -0
  41. template.py +53 -0
.github/workflows/main.yml ADDED
@@ -0,0 +1,34 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # To run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       - name: Set up Git user
+         run: |
+           git config --global user.email "[email protected]"
+           git config --global user.name "satyam998"
+
+       - name: Create a new branch
+         run: |
+           git checkout --orphan temp
+           git add -A
+           git commit -m "Initial commit"
+           git branch -D main
+           git branch -m main
+
+       - name: Force push to hub
+         env:
+           HF: ${{ secrets.HG }}
+         run: git push --force https://satyam998:[email protected]/spaces/satyam998/pegasus-summary-lm main
.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+ artifacts/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+ artifacts/
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+
+ WORKDIR /app
+
+ COPY --chown=user . /app
+
+ RUN pip install -r requirements.txt
+ RUN pip install --upgrade accelerate
+ RUN pip uninstall -y transformers accelerate
+ RUN pip install transformers accelerate
+
+ RUN mkdir -p /app/logs
+ RUN chmod 777 /app/logs
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Pegasus Summary Lm
+ emoji: 🔥
+ colorFrom: green
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,36 @@
+ from fastapi import FastAPI
+ import uvicorn
+ import sys
+ import os
+ from fastapi.templating import Jinja2Templates
+ from starlette.responses import RedirectResponse
+ from fastapi.responses import Response
+ from summarylm.pipeline.prediction import PredictionPipeline
+ from summarylm.exception import CustomException
+
+ text: str = "What is Text Summarization?"
+
+ app = FastAPI()
+
+ @app.get("/", tags=["authentication"])
+ async def index():
+     return RedirectResponse(url='/docs')
+
+ @app.get("/train")
+ async def training():
+     try:
+         os.system("python main.py")
+         return Response("Training Successful!!")
+     except Exception as e:
+         return Response(f"Error Occurred! {e}")
+
+ @app.post("/predict")
+ async def predict_route(text, max_length: int = 128):
+     try:
+         print(type(max_length))
+         obj = PredictionPipeline()
+         text = obj.predict(text, max_length)
+         return text
+     except Exception as e:
+         raise CustomException(e, sys) from e
+
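A minimal client sketch for exercising these endpoints (assuming the app is served on port 7860, as in the Dockerfile's uvicorn command; the base URL and timeout below are placeholders):

    import requests

    BASE_URL = "http://localhost:7860"  # hypothetical; substitute the deployed Space URL if applicable

    # predict_route declares text and max_length as plain parameters, so FastAPI
    # reads them from the query string even though the route is a POST.
    response = requests.post(
        f"{BASE_URL}/predict",
        params={"text": "What is Text Summarization?", "max_length": 128},
        timeout=300,
    )
    print(response.json())  # the summary string returned by PredictionPipeline.predict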
config/config.yaml ADDED
@@ -0,0 +1,34 @@
+ artifacts_root: artifacts
+
+ data_ingestion:
+   root_dir: artifacts/data_ingestion
+   ALL_HUGGINGFACE_DATA: ["d0rj/wikisum", "multi_news"]
+   LOCAL_DATA_FILE: ["artifacts/data_ingestion/wikisum", "artifacts/data_ingestion/multi_news"]
+
+
+ data_validation:
+   root_dir: artifacts/data_validation
+   STATUS_FILE: artifacts/data_validation/status.txt
+   ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
+   ALL_REQUIRED_FILES: ["train", "test", "validation"]
+
+
+ data_transformation:
+   root_dir: artifacts/data_transformation
+   data_path: artifacts/data_ingestion/
+   ALL_REQUIRED_DATA: ["wikisum", "multi_news"]
+   tokenizer_name: google/pegasus-cnn_dailymail
+
+
+ model_trainer:
+   root_dir: artifacts/model_trainer
+   data_path: artifacts/data_transformation/dataset
+   model_ckpt: google/pegasus-cnn_dailymail
+
+
+ model_evaluation:
+   root_dir: artifacts/model_evaluation
+   data_path: artifacts/data_transformation/dataset
+   model_path: artifacts/model_trainer/pegasus-summary-lm
+   tokenizer_path: artifacts/model_trainer/tokenizer
+   metric_file_name: artifacts/model_evaluation/metrics.csv
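For orientation, a short sketch of how this file is consumed elsewhere in the repo (the ConfigurationManager pattern shown in the research notebooks below): read_yaml returns a Box-style object, so nested keys are available as attributes.

    from summarylm.constants import CONFIG_FILE_PATH
    from summarylm.utils.common import read_yaml, create_directories

    config = read_yaml(CONFIG_FILE_PATH)
    create_directories([config.artifacts_root])          # artifacts/
    print(config.data_ingestion.ALL_HUGGINGFACE_DATA)    # ["d0rj/wikisum", "multi_news"]
    print(config.model_trainer.model_ckpt)               # google/pegasus-cnn_dailymail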
main.py ADDED
@@ -0,0 +1,66 @@
+ import sys
+ from summarylm.pipeline.data_ingestion import DataIngestionPipeline
+ from summarylm.pipeline.data_validation import DataValidationPipeline
+ from summarylm.pipeline.data_transformation import DataTransformationPipeline
+ from summarylm.pipeline.model_trainer import ModelTrainerPipeline
+ from summarylm.pipeline.model_evaluation import ModelEvaluationPipeline
+ from summarylm.logging import logger
+ from summarylm.exception import CustomException
+
+ # data ingestion
+ STAGE_NAME = "Data Ingestion"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_ingestion = DataIngestionPipeline()
+     data_ingestion.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # data validation
+ STAGE_NAME = "Data Validation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_validation = DataValidationPipeline()
+     data_validation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # data transformation
+ STAGE_NAME = "Data Transformation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     data_transformation = DataTransformationPipeline()
+     data_transformation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+
+ # model trainer
+ STAGE_NAME = "Model Trainer"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     model_trainer = ModelTrainerPipeline()
+     model_trainer.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
+
+ # model evaluation
+ STAGE_NAME = "Model Evaluation"
+
+ try:
+     logger.info(f"Starting {STAGE_NAME} stage...")
+     model_evaluation = ModelEvaluationPipeline()
+     model_evaluation.main()
+     logger.info(f"Completed {STAGE_NAME} stage...")
+ except Exception as e:
+     raise CustomException(e, sys) from e
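The five stage blocks above repeat the same try/except pattern; a possible refactor (not part of this commit) is to drive them from a single list:

    import sys
    from summarylm.pipeline.data_ingestion import DataIngestionPipeline
    from summarylm.pipeline.data_validation import DataValidationPipeline
    from summarylm.pipeline.data_transformation import DataTransformationPipeline
    from summarylm.pipeline.model_trainer import ModelTrainerPipeline
    from summarylm.pipeline.model_evaluation import ModelEvaluationPipeline
    from summarylm.logging import logger
    from summarylm.exception import CustomException

    STAGES = [
        ("Data Ingestion", DataIngestionPipeline),
        ("Data Validation", DataValidationPipeline),
        ("Data Transformation", DataTransformationPipeline),
        ("Model Trainer", ModelTrainerPipeline),
        ("Model Evaluation", ModelEvaluationPipeline),
    ]

    # Each pipeline class exposes a main() entry point, exactly as used above.
    for stage_name, pipeline_cls in STAGES:
        try:
            logger.info(f"Starting {stage_name} stage...")
            pipeline_cls().main()
            logger.info(f"Completed {stage_name} stage...")
        except Exception as e:
            raise CustomException(e, sys) from e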
params.yaml ADDED
@@ -0,0 +1,10 @@
+ TrainingArguments:
+   num_train_epochs: 1
+   warmup_steps: 500
+   per_device_train_batch_size: 1
+   weight_decay: 0.01
+   logging_steps: 10
+   evaluation_strategy: steps
+   eval_steps: 500
+   save_steps: 1e6
+   gradient_accumulation_steps: 16
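These values are intended for transformers.TrainingArguments (the actual wiring lives in src/summarylm/components/model_trainer.py, which is not expanded in this view). A rough sketch of the mapping, assuming the same read_yaml helper used throughout the repo:

    from transformers import TrainingArguments
    from summarylm.constants import PARAMS_FILE_PATH
    from summarylm.utils.common import read_yaml

    params = read_yaml(PARAMS_FILE_PATH).TrainingArguments

    training_args = TrainingArguments(
        output_dir="artifacts/model_trainer",   # assumed; matches model_trainer.root_dir in config.yaml
        num_train_epochs=params.num_train_epochs,
        warmup_steps=params.warmup_steps,
        per_device_train_batch_size=params.per_device_train_batch_size,
        weight_decay=params.weight_decay,
        logging_steps=params.logging_steps,
        evaluation_strategy=params.evaluation_strategy,
        eval_steps=params.eval_steps,
        save_steps=int(float(params.save_steps)),   # cast defensively: some YAML loaders read "1e6" as a string
        gradient_accumulation_steps=params.gradient_accumulation_steps,
    )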
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ transformers
+ transformers[sentencepiece]
+ transformers[torch]
+ datasets
+ sacrebleu
+ rouge_score
+ py7zr
+ pandas
+ nltk
+ tqdm
+ PyYAML
+ matplotlib
+ torch
+ notebook
+ boto3
+ mypy-boto3-s3
+ python-box==7.1.1
+ ensure==1.0.4
+ fastapi==0.78.0
+ uvicorn==0.29.0
+ Jinja2==3.1.4
+ google-cloud-storage
+ -e .
research/data_ingestion.ipynb ADDED
@@ -0,0 +1,183 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import zipfile\n",
11
+ "from dataclasses import dataclass\n",
12
+ "from pathlib import Path\n",
13
+ "\n",
14
+ "from summarylm.logging import logger\n",
15
+ "from summarylm.constants import *\n",
16
+ "from summarylm.utils.common import read_yaml, create_directories, get_size"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 2,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "os.chdir(\"../\")"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 5,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "@dataclass(frozen=True)\n",
35
+ "class DataIngestionConfig:\n",
36
+ " root_dir: Path\n",
37
+ " ALL_HUGGINGFACE_DATA: list\n",
38
+ " LOCAL_DATA_FILE: list"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 6,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "class ConfigurationManager:\n",
48
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:\n",
49
+ " self.config = read_yaml(config_filepath)\n",
50
+ " self.params = read_yaml(params_filepath)\n",
51
+ "\n",
52
+ " create_directories([self.config.artifacts_root])\n",
53
+ "\n",
54
+ " def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
55
+ " config = self.config.data_ingestion\n",
56
+ "\n",
57
+ " create_directories([config.root_dir])\n",
58
+ "\n",
59
+ " data_ingestion_config = DataIngestionConfig(\n",
60
+ " root_dir=config.root_dir,\n",
61
+ " ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,\n",
62
+ " LOCAL_DATA_FILE = config.LOCAL_DATA_FILE,\n",
63
+ " )\n",
64
+ "\n",
65
+ " return data_ingestion_config"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 14,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "from datasets import load_dataset\n",
75
+ "\n",
76
+ "class DataIngestion:\n",
77
+ " def __init__(self, config: DataIngestionConfig):\n",
78
+ " self.config = config\n",
79
+ "\n",
80
+ " def download_data(self):\n",
81
+ " for i in range(len(self.config.LOCAL_DATA_FILE)):\n",
82
+ " if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):\n",
83
+ " dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])\n",
84
+ " dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])\n",
85
+ " logger.info(f\"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!\")\n",
86
+ " else:\n",
87
+ " logger.info(f\"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}\")"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 15,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "[2024-05-23 07:53:04,706: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
100
+ "[2024-05-23 07:53:04,709: INFO: common: Yaml file: params.yaml loaded successfully]\n",
101
+ "[2024-05-23 07:53:04,710: INFO: common: Directory created successfully at: artifacts]\n",
102
+ "[2024-05-23 07:53:04,711: INFO: common: Directory created successfully at: artifacts/data_ingestion]\n",
103
+ "[2024-05-23 07:53:04,711: INFO: 368978256: File already exists of size: ~0 KB]\n"
104
+ ]
105
+ },
106
+ {
107
+ "name": "stderr",
108
+ "output_type": "stream",
109
+ "text": [
110
+ "Downloading data: 100%|██████████| 295M/295M [00:34<00:00, 8.46MB/s] \n",
111
+ "Downloading data: 100%|██████████| 28.3M/28.3M [00:05<00:00, 5.38MB/s]\n",
112
+ "Downloading data: 100%|██████████| 39.5M/39.5M [00:06<00:00, 5.72MB/s]\n",
113
+ "Downloading data: 100%|██████████| 40.1M/40.1M [00:06<00:00, 5.83MB/s]\n",
114
+ "Generating train split: 100%|██████████| 44972/44972 [00:03<00:00, 13618.69 examples/s]\n",
115
+ "Generating validation split: 100%|██████████| 5622/5622 [00:00<00:00, 25120.36 examples/s]\n",
116
+ "Generating test split: 100%|██████████| 5622/5622 [00:00<00:00, 22323.24 examples/s]\n",
117
+ "Saving the dataset (2/2 shards): 100%|██████████| 44972/44972 [00:07<00:00, 5653.51 examples/s] \n",
118
+ "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15343.69 examples/s]\n",
119
+ "Saving the dataset (1/1 shards): 100%|██████████| 5622/5622 [00:00<00:00, 15216.24 examples/s]"
120
+ ]
121
+ },
122
+ {
123
+ "name": "stdout",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "[2024-05-23 07:54:25,968: INFO: 368978256: multi_news downloaded!]\n"
127
+ ]
128
+ },
129
+ {
130
+ "name": "stderr",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "try:\n",
139
+ " config = ConfigurationManager()\n",
140
+ " data_ingestion_config = config.get_data_ingestion_config()\n",
141
+ " data_ingestion = DataIngestion(config=data_ingestion_config)\n",
142
+ " data_ingestion.download_data()\n",
143
+ "except Exception as e:\n",
144
+ " raise e "
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": []
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": []
160
+ }
161
+ ],
162
+ "metadata": {
163
+ "kernelspec": {
164
+ "display_name": "env",
165
+ "language": "python",
166
+ "name": "python3"
167
+ },
168
+ "language_info": {
169
+ "codemirror_mode": {
170
+ "name": "ipython",
171
+ "version": 3
172
+ },
173
+ "file_extension": ".py",
174
+ "mimetype": "text/x-python",
175
+ "name": "python",
176
+ "nbconvert_exporter": "python",
177
+ "pygments_lexer": "ipython3",
178
+ "version": "3.12.2"
179
+ }
180
+ },
181
+ "nbformat": 4,
182
+ "nbformat_minor": 2
183
+ }
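A quick sanity-check sketch for the ingested data: reload one of the datasets that DataIngestion.download_data() wrote with save_to_disk and inspect its splits (the paths follow LOCAL_DATA_FILE in config.yaml).

    from datasets import load_from_disk

    dataset = load_from_disk("artifacts/data_ingestion/multi_news")
    print(dataset)                                   # DatasetDict with train/validation/test splits
    print(dataset["train"][0]["document"][:200])     # multi_news stores articles in the "document" column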
research/data_transformation.ipynb ADDED
@@ -0,0 +1,234 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "\n",
11
+ "os.chdir(\"../\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "from dataclasses import dataclass\n",
21
+ "from pathlib import Path\n",
22
+ "\n",
23
+ "@dataclass(frozen=True)\n",
24
+ "class DataTransformationConfig:\n",
25
+ " root_dir: Path\n",
26
+ " data_path: Path\n",
27
+ " ALL_REQUIRED_DATA: Path\n",
28
+ " tokenizer_name: Path"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 3,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from summarylm.constants import *\n",
38
+ "from summarylm.utils.common import read_yaml, create_directories"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "class ConfigurationManager:\n",
48
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
49
+ " self.config = read_yaml(config_filepath)\n",
50
+ " self.params = read_yaml(params_filepath)\n",
51
+ "\n",
52
+ " create_directories([self.config.artifacts_root])\n",
53
+ "\n",
54
+ " def get_data_transformation_config(self) -> DataTransformationConfig:\n",
55
+ " config = self.config.data_transformation\n",
56
+ "\n",
57
+ " create_directories([config.root_dir])\n",
58
+ "\n",
59
+ " data_transformation_config = DataTransformationConfig(\n",
60
+ " root_dir=config.root_dir,\n",
61
+ " data_path=config.data_path,\n",
62
+ " ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
63
+ " tokenizer_name=config.tokenizer_name\n",
64
+ " )\n",
65
+ "\n",
66
+ " return data_transformation_config"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 7,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "import os\n",
76
+ "import sys\n",
77
+ "from summarylm.logging import logger\n",
78
+ "from summarylm.exception import CustomException\n",
79
+ "from transformers import AutoTokenizer\n",
80
+ "from datasets import load_dataset, load_from_disk\n",
81
+ "from datasets import concatenate_datasets, DatasetDict"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 24,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "class DataTransformation:\n",
91
+ " def __init__(self, config: DataTransformationConfig):\n",
92
+ " self.config = config\n",
93
+ " self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)\n",
94
+ "\n",
95
+ " def convert_data_into_right_format(self, datasets: list) -> DatasetDict:\n",
96
+ " # loading all datasets\n",
97
+ " loaded_datasets = {}\n",
98
+ " print(\"Loading the dataset\")\n",
99
+ " for data in datasets:\n",
100
+ " loaded_datasets[data] = load_from_disk(data)\n",
101
+ "\n",
102
+ " dataset1 = loaded_datasets[datasets[0]]\n",
103
+ " dataset2 = loaded_datasets[datasets[1]]\n",
104
+ " print(\"Dataset loaded\")\n",
105
+ "\n",
106
+ " # removing unwanted columns from dataset1\n",
107
+ " dataset1_train = dataset1['train'].select_columns(['article', 'summary'])\n",
108
+ " dataset1_test = dataset1['test'].select_columns(['article', 'summary'])\n",
109
+ " dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])\n",
110
+ "\n",
111
+ " # renaming data column name of dataset1\n",
112
+ " dataset1_train = dataset1_train.rename_column('article', 'text')\n",
113
+ " dataset1_test = dataset1_test.rename_column('article', 'text')\n",
114
+ " dataset1_validation = dataset1_validation.rename_column('article', 'text')\n",
115
+ "\n",
116
+ " # renaming data column name of dataset2\n",
117
+ " dataset2_train = dataset2['train'].rename_column('document', 'text')\n",
118
+ " dataset2_test = dataset2['test'].rename_column('document', 'text')\n",
119
+ " dataset2_validation = dataset2['validation'].rename_column('document', 'text')\n",
120
+ "\n",
121
+ " # concatenate_datasets\n",
122
+ " dataset_train = concatenate_datasets([dataset1_train, dataset2_train])\n",
123
+ " dataset_test = concatenate_datasets([dataset1_test, dataset2_test])\n",
124
+ " dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])\n",
125
+ "\n",
126
+ " # loading teh dataset into DatasetDict\n",
127
+ " dataset = DatasetDict({\n",
128
+ " \"train\": dataset_train,\n",
129
+ " \"validation\": dataset_validation,\n",
130
+ " \"test\": dataset_test,\n",
131
+ " })\n",
132
+ "\n",
133
+ " return dataset\n",
134
+ "\n",
135
+ " def convert_examples_to_features(self, example_batch):\n",
136
+ " input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)\n",
137
+ " \n",
138
+ " with self.tokenizer.as_target_tokenizer():\n",
139
+ " target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)\n",
140
+ " \n",
141
+ " return {\n",
142
+ " 'input_ids': input_encodings['input_ids'],\n",
143
+ " 'attention_mask': input_encodings['attention_mask'],\n",
144
+ " 'labels': target_encodings['input_ids']\n",
145
+ " }\n",
146
+ " \n",
147
+ " def convert(self):\n",
148
+ " data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])\n",
149
+ " data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])\n",
150
+ "\n",
151
+ " dataset = self.convert_data_into_right_format([data1, data2])\n",
152
+ " dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)\n",
153
+ " dataset_pt.save_to_disk(os.path.join(self.config.root_dir, \"dataset\"))"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 25,
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "name": "stdout",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "[2024-05-23 09:04:24,048: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
166
+ "[2024-05-23 09:04:24,051: INFO: common: Yaml file: params.yaml loaded successfully]\n",
167
+ "[2024-05-23 09:04:24,052: INFO: common: Directory created successfully at: artifacts]\n",
168
+ "[2024-05-23 09:04:24,053: INFO: common: Directory created successfully at: artifacts/data_transformation]\n",
169
+ "Loading the dataset\n",
170
+ "Dataset loaded\n"
171
+ ]
172
+ },
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "Map: 0%| | 0/80747 [00:00<?, ? examples/s]d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:3921: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
178
+ " warnings.warn(\n",
179
+ "Map: 100%|██████████| 80747/80747 [11:43<00:00, 114.72 examples/s]\n",
180
+ "Map: 100%|██████████| 7622/7622 [01:20<00:00, 94.22 examples/s] \n",
181
+ "Map: 100%|██████████| 7622/7622 [01:59<00:00, 63.80 examples/s] \n",
182
+ "Saving the dataset (3/3 shards): 100%|██████████| 80747/80747 [00:13<00:00, 5803.62 examples/s] \n",
183
+ "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 4202.00 examples/s]\n",
184
+ "Saving the dataset (1/1 shards): 100%|██████████| 7622/7622 [00:01<00:00, 6924.25 examples/s]\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "try:\n",
190
+ " config = ConfigurationManager()\n",
191
+ " data_transformation_config = config.get_data_transformation_config()\n",
192
+ " data_transformation = DataTransformation(config=data_transformation_config)\n",
193
+ " data_transformation.convert()\n",
194
+ "except Exception as e:\n",
195
+ " raise e"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": []
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": []
211
+ }
212
+ ],
213
+ "metadata": {
214
+ "kernelspec": {
215
+ "display_name": "env",
216
+ "language": "python",
217
+ "name": "python3"
218
+ },
219
+ "language_info": {
220
+ "codemirror_mode": {
221
+ "name": "ipython",
222
+ "version": 3
223
+ },
224
+ "file_extension": ".py",
225
+ "mimetype": "text/x-python",
226
+ "name": "python",
227
+ "nbconvert_exporter": "python",
228
+ "pygments_lexer": "ipython3",
229
+ "version": "3.12.2"
230
+ }
231
+ },
232
+ "nbformat": 4,
233
+ "nbformat_minor": 2
234
+ }
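The Map step above logs a deprecation warning for as_target_tokenizer(); in newer transformers releases the same label tokenization can be expressed with the text_target argument. A sketch of an equivalent convert_examples_to_features (same output fields, no context manager):

    def convert_examples_to_features(self, example_batch):
        input_encodings = self.tokenizer(example_batch["text"], max_length=1024, truncation=True)
        # tokenize the summaries as targets directly, replacing the deprecated as_target_tokenizer() block
        target_encodings = self.tokenizer(text_target=example_batch["summary"], max_length=128, truncation=True)

        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"],
        }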
research/data_validation.ipynb ADDED
@@ -0,0 +1,197 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir(\"../\")"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 5,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class DataValidationConfig:\n",
52
+ " root_dir: Path\n",
53
+ " STATUS_FILE: str\n",
54
+ " ALL_REQUIRED_DATA: list\n",
55
+ " ALL_REQUIRED_FILES: list"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 6,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from summarylm.constants import *\n",
65
+ "from summarylm.utils.common import read_yaml, create_directories"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 7,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "class ConfigurationManager:\n",
75
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
76
+ " self.config = read_yaml(config_filepath)\n",
77
+ " self.params = read_yaml(params_filepath)\n",
78
+ "\n",
79
+ " create_directories([self.config.artifacts_root])\n",
80
+ "\n",
81
+ " def get_data_validation_config(self) -> DataValidationConfig:\n",
82
+ " config = self.config.data_validation\n",
83
+ "\n",
84
+ " create_directories([config.root_dir])\n",
85
+ "\n",
86
+ " data_validation_config = DataValidationConfig(\n",
87
+ " root_dir=config.root_dir,\n",
88
+ " STATUS_FILE=config.STATUS_FILE,\n",
89
+ " ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,\n",
90
+ " ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,\n",
91
+ " )\n",
92
+ "\n",
93
+ " return data_validation_config"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 8,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "import os\n",
103
+ "import sys\n",
104
+ "from summarylm.logging import logger\n",
105
+ "from summarylm.exception import CustomException"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 9,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "class DataValidation:\n",
115
+ " def __init__(self, config: DataValidationConfig):\n",
116
+ " self.config = config\n",
117
+ "\n",
118
+ " def validate_all_files_exist(self) -> bool:\n",
119
+ " try:\n",
120
+ " validation_status = None\n",
121
+ "\n",
122
+ " for data in self.config.ALL_REQUIRED_DATA:\n",
123
+ " all_files = os.listdir(os.path.join(\"artifacts\", \"data_ingestion\", data))\n",
124
+ "\n",
125
+ " for file in all_files:\n",
126
+ " if file not in self.config.ALL_REQUIRED_FILES:\n",
127
+ " validation_status = False\n",
128
+ "\n",
129
+ " with open(self.config.STATUS_FILE, 'w') as f:\n",
130
+ " f.write(f\"Validation status: {validation_status}\")\n",
131
+ " else:\n",
132
+ " validation_status = True\n",
133
+ "\n",
134
+ " with open(self.config.STATUS_FILE, 'w') as f:\n",
135
+ " f.write(f\"Validation status: {validation_status}\")\n",
136
+ "\n",
137
+ " return validation_status\n",
138
+ " except Exception as e:\n",
139
+ " raise CustomException(e, sys) from e"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 10,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "name": "stdout",
149
+ "output_type": "stream",
150
+ "text": [
151
+ "[2024-05-23 08:11:43,852: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
152
+ "[2024-05-23 08:11:43,856: INFO: common: Yaml file: params.yaml loaded successfully]\n",
153
+ "[2024-05-23 08:11:43,857: INFO: common: Directory created successfully at: artifacts]\n",
154
+ "[2024-05-23 08:11:43,858: INFO: common: Directory created successfully at: artifacts/data_validation]\n"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "try:\n",
160
+ " config = ConfigurationManager()\n",
161
+ " data_validation_config = config.get_data_validation_config()\n",
162
+ " data_validation = DataValidation(config=data_validation_config)\n",
163
+ " data_validation.validate_all_files_exist()\n",
164
+ "except Exception as e:\n",
165
+ " raise CustomException(e, sys) from e"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": []
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "env",
179
+ "language": "python",
180
+ "name": "python3"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 3
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython3",
192
+ "version": "3.12.2"
193
+ }
194
+ },
195
+ "nbformat": 4,
196
+ "nbformat_minor": 2
197
+ }
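One caveat in validate_all_files_exist() above: validation_status is rewritten on every file, so only the last entry decides the final status, and a required split that is simply absent never flips it to False. A stricter sketch using the same config fields would be:

    import os

    def validate_all_files_exist(self) -> bool:
        validation_status = True
        for data in self.config.ALL_REQUIRED_DATA:
            present = set(os.listdir(os.path.join("artifacts", "data_ingestion", data)))
            missing = [f for f in self.config.ALL_REQUIRED_FILES if f not in present]
            if missing:
                validation_status = False

        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status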
research/experiment.ipynb ADDED
@@ -0,0 +1,74 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from ensure import ensure_annotations"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 5,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "@ensure_annotations\n",
19
+ "def multiply(a: int, b: int) -> int:\n",
20
+ " return a * b"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 6,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "ename": "EnsureError",
30
+ "evalue": "Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>",
31
+ "output_type": "error",
32
+ "traceback": [
33
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
34
+ "\u001b[1;31mEnsureError\u001b[0m Traceback (most recent call last)",
35
+ "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mmultiply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m3\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
36
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\ensure\\main.py:870\u001b[0m, in \u001b[0;36mWrappedFunctionReturn.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 868\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value, templ):\n\u001b[0;32m 869\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mArgument \u001b[39m\u001b[38;5;132;01m{arg}\u001b[39;00m\u001b[38;5;124m of type \u001b[39m\u001b[38;5;132;01m{valt}\u001b[39;00m\u001b[38;5;124m to \u001b[39m\u001b[38;5;132;01m{f}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match annotation type \u001b[39m\u001b[38;5;132;01m{t}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 870\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m EnsureError(msg\u001b[38;5;241m.\u001b[39mformat(arg\u001b[38;5;241m=\u001b[39marg, f\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf, t\u001b[38;5;241m=\u001b[39mtempl, valt\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mtype\u001b[39m(value)))\n\u001b[0;32m 872\u001b[0m return_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mf(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 873\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_val, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_templ):\n",
37
+ "\u001b[1;31mEnsureError\u001b[0m: Argument b of type <class 'str'> to <function multiply at 0x000001A07A809D00> does not match annotation type <class 'int'>"
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "multiply(2, \"3\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": []
51
+ }
52
+ ],
53
+ "metadata": {
54
+ "kernelspec": {
55
+ "display_name": "env",
56
+ "language": "python",
57
+ "name": "python3"
58
+ },
59
+ "language_info": {
60
+ "codemirror_mode": {
61
+ "name": "ipython",
62
+ "version": 3
63
+ },
64
+ "file_extension": ".py",
65
+ "mimetype": "text/x-python",
66
+ "name": "python",
67
+ "nbconvert_exporter": "python",
68
+ "pygments_lexer": "ipython3",
69
+ "version": "3.12.2"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 2
74
+ }
research/model_evaluation.ipynb ADDED
@@ -0,0 +1,265 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir(\"../\")"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class ModelEvaluationConfig:\n",
52
+ " root_dir: Path\n",
53
+ " data_path: Path\n",
54
+ " model_path: Path\n",
55
+ " tokenizer_path: Path\n",
56
+ " metric_file_name: Path"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "from summarylm.constants import *\n",
66
+ "from summarylm.utils.common import read_yaml, create_directories"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 7,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "class ConfigurationManager:\n",
76
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
77
+ " self.config = read_yaml(config_filepath)\n",
78
+ " self.params = read_yaml(params_filepath)\n",
79
+ "\n",
80
+ " create_directories([self.config.artifacts_root])\n",
81
+ "\n",
82
+ " def get_model_evaluation_config(self) -> ModelEvaluationConfig:\n",
83
+ " config = self.config.model_evaluation\n",
84
+ "\n",
85
+ " create_directories([config.root_dir])\n",
86
+ "\n",
87
+ " model_evaluation_config = ModelEvaluationConfig(\n",
88
+ " root_dir=config.root_dir,\n",
89
+ " data_path=config.data_path,\n",
90
+ " model_path=config.model_path,\n",
91
+ " tokenizer_path=config.tokenizer_path,\n",
92
+ " metric_file_name=config.metric_file_name,\n",
93
+ " )\n",
94
+ "\n",
95
+ " return model_evaluation_config"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 9,
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "[2024-05-21 08:29:30,191: INFO: config: PyTorch version 2.3.0 available.]\n"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
113
+ "from datasets import load_dataset, load_from_disk, load_metric\n",
114
+ "import torch\n",
115
+ "import pandas as pd\n",
116
+ "from tqdm import tqdm"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 10,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "class ModelEvaluation:\n",
126
+ " def __init__(self, config: ModelEvaluationConfig):\n",
127
+ " self.config = config\n",
128
+ " \n",
129
+ " def generate_batch_size_chunks(self, list_of_elements, batch_size):\n",
130
+ " \"\"\"\n",
131
+ " Split the dataset into smaller batches that we can process simultaneously\n",
132
+ " Yield successive batch-sized chunks from list_of_elements.\n",
133
+ " \"\"\"\n",
134
+ " for i in range(0, len(list_of_elements), batch_size):\n",
135
+ " yield list_of_elements[i : i + batch_size]\n",
136
+ " \n",
137
+ " def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,\n",
138
+ " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
139
+ " column_text=\"article\",\n",
140
+ " column_summary=\"highlights\"):\n",
141
+ " article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))\n",
142
+ " target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))\n",
143
+ " \n",
144
+ " for article_batch, target_batch in tqdm(\n",
145
+ " zip(article_batches, target_batches), total=len(article_batches)):\n",
146
+ " \n",
147
+ " inputs = tokenizer(article_batch, max_length=1024, truncation=True, \n",
148
+ " padding=\"max_length\", return_tensors=\"pt\")\n",
149
+ " \n",
150
+ " summaries = model.generate(input_ids=inputs[\"input_ids\"].to(device),\n",
151
+ " attention_mask=inputs[\"attention_mask\"].to(device), \n",
152
+ " length_penalty=0.8, num_beams=8, max_length=128)\n",
153
+ " \n",
154
+ " ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''\n",
155
+ " \n",
156
+ " # Finally, we decode the generated texts, \n",
157
+ " # replace the token, and add the decoded texts with the references to the metric.\n",
158
+ " decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, \n",
159
+ " clean_up_tokenization_spaces=True) for s in summaries] \n",
160
+ " \n",
161
+ " decoded_summaries = [d.replace(\"\", \" \") for d in decoded_summaries]\n",
162
+ " \n",
163
+ " \n",
164
+ " metric.add_batch(predictions=decoded_summaries, references=target_batch)\n",
165
+ " \n",
166
+ " # Finally compute and return the ROUGE scores.\n",
167
+ " score = metric.compute()\n",
168
+ " return score\n",
169
+ " \n",
170
+ " def evaluation(self):\n",
171
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
172
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)\n",
173
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)\n",
174
+ "\n",
175
+ " # loading data\n",
176
+ " dataset_pt = load_from_disk(self.config.data_path)\n",
177
+ "\n",
178
+ " rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
179
+ " \n",
180
+ " rouge_metric = load_metric('rouge')\n",
181
+ "\n",
182
+ " score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')\n",
183
+ " rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )\n",
184
+ " df = pd.DataFrame(rouge_dict, index = ['pegasus'] )\n",
185
+ " df.to_csv(self.config.metric_file_name, index=False)"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 12,
191
+ "metadata": {},
192
+ "outputs": [
193
+ {
194
+ "name": "stdout",
195
+ "output_type": "stream",
196
+ "text": [
197
+ "[2024-05-21 08:43:47,280: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
198
+ "[2024-05-21 08:43:47,284: INFO: common: Yaml file: params.yaml loaded successfully]\n",
199
+ "[2024-05-21 08:43:47,285: INFO: common: Directory created successfully at: artifacts]\n",
200
+ "[2024-05-21 08:43:47,286: INFO: common: Directory created successfully at: artifacts/model_evaluation]\n"
201
+ ]
202
+ },
203
+ {
204
+ "ename": "OSError",
205
+ "evalue": "Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub.",
206
+ "output_type": "error",
207
+ "traceback": [
208
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
209
+ "\u001b[1;31mHFValidationError\u001b[0m Traceback (most recent call last)",
210
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:398\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 396\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 397\u001b[0m \u001b[38;5;66;03m# Load from URL or cache if already cached\u001b[39;00m\n\u001b[1;32m--> 398\u001b[0m resolved_file \u001b[38;5;241m=\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 402\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 403\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 404\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 405\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 406\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 408\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 409\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 410\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 411\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m GatedRepoError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
211
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:106\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg_name \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrepo_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfrom_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mto_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m--> 106\u001b[0m \u001b[43mvalidate_repo_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43marg_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arg_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m arg_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
212
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:154\u001b[0m, in \u001b[0;36mvalidate_repo_id\u001b[1;34m(repo_id)\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m repo_id\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 154\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HFValidationError(\n\u001b[0;32m 155\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepo id must be in the form \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrepo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnamespace/repo_name\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Use `repo_type` argument if needed.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 157\u001b[0m )\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m REPO_ID_REGEX\u001b[38;5;241m.\u001b[39mmatch(repo_id):\n",
213
+ "\u001b[1;31mHFValidationError\u001b[0m: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'artifacts/model_trainer/tokenizer'. Use `repo_type` argument if needed.",
214
+ "\nThe above exception was the direct cause of the following exception:\n",
215
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
216
+ "Cell \u001b[1;32mIn[12], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m model_evaluation_config\u001b[38;5;241m.\u001b[39mevaluation()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
217
+ "Cell \u001b[1;32mIn[12], line 5\u001b[0m\n\u001b[0;32m 3\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m config\u001b[38;5;241m.\u001b[39mget_model_evaluation_config()\n\u001b[0;32m 4\u001b[0m model_evaluation_config \u001b[38;5;241m=\u001b[39m ModelEvaluation(config\u001b[38;5;241m=\u001b[39mmodel_evaluation_config)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mmodel_evaluation_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
218
+ "Cell \u001b[1;32mIn[10], line 48\u001b[0m, in \u001b[0;36mModelEvaluation.evaluation\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mevaluation\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 47\u001b[0m device \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcuda\u001b[38;5;241m.\u001b[39mis_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m---> 48\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 49\u001b[0m model_pegasus \u001b[38;5;241m=\u001b[39m AutoModelForSeq2SeqLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mmodel_path)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m 51\u001b[0m \u001b[38;5;66;03m# loading data\u001b[39;00m\n",
219
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:804\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[1;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[0;32m 801\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer_class\u001b[38;5;241m.\u001b[39mfrom_pretrained(pretrained_model_name_or_path, \u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 803\u001b[0m \u001b[38;5;66;03m# Next, let's try to use the tokenizer_config file to get the tokenizer class.\u001b[39;00m\n\u001b[1;32m--> 804\u001b[0m tokenizer_config \u001b[38;5;241m=\u001b[39m \u001b[43mget_tokenizer_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m tokenizer_config:\n\u001b[0;32m 806\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m tokenizer_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
220
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\models\\auto\\tokenization_auto.py:637\u001b[0m, in \u001b[0;36mget_tokenizer_config\u001b[1;34m(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)\u001b[0m\n\u001b[0;32m 634\u001b[0m token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[0;32m 636\u001b[0m commit_hash \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m--> 637\u001b[0m resolved_config_file \u001b[38;5;241m=\u001b[39m \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[43m \u001b[49m\u001b[43mTOKENIZER_CONFIG_FILE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 640\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 641\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 642\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 643\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 644\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 645\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_gated_repo\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_missing_entries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43m_raise_exceptions_for_connection_errors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 652\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resolved_config_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 654\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not locate the tokenizer configuration file, will try to use the model config 
instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
221
+ "File \u001b[1;32md:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\transformers\\utils\\hub.py:462\u001b[0m, in \u001b[0;36mcached_file\u001b[1;34m(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[0m\n\u001b[0;32m 460\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a specific connection error when trying to load \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HFValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[0;32m 463\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIncorrect path_or_model_id: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_or_repo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Please provide either the path to a local folder or the repo_id of a model on the Hub.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 464\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resolved_file\n",
222
+ "\u001b[1;31mOSError\u001b[0m: Incorrect path_or_model_id: 'artifacts/model_trainer/tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
223
+ ]
224
+ }
225
+ ],
226
+ "source": [
227
+ "try:\n",
228
+ " config = ConfigurationManager()\n",
229
+ " model_evaluation_config = config.get_model_evaluation_config()\n",
230
+ " model_evaluation_config = ModelEvaluation(config=model_evaluation_config)\n",
231
+ " model_evaluation_config.evaluation()\n",
232
+ "except Exception as e:\n",
233
+ " raise e"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": []
242
+ }
243
+ ],
244
+ "metadata": {
245
+ "kernelspec": {
246
+ "display_name": "env",
247
+ "language": "python",
248
+ "name": "python3"
249
+ },
250
+ "language_info": {
251
+ "codemirror_mode": {
252
+ "name": "ipython",
253
+ "version": 3
254
+ },
255
+ "file_extension": ".py",
256
+ "mimetype": "text/x-python",
257
+ "name": "python",
258
+ "nbconvert_exporter": "python",
259
+ "pygments_lexer": "ipython3",
260
+ "version": "3.12.2"
261
+ }
262
+ },
263
+ "nbformat": 4,
264
+ "nbformat_minor": 2
265
+ }
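Note: the OSError above is the evaluation stage running before the trainer stage has produced its artifacts: 'artifacts/model_trainer/tokenizer' does not exist yet, so transformers falls back to treating the path as a Hub repo id and rejects it. A minimal pre-flight check, assuming the artifact paths referenced in the traceback and in config.yaml (they may differ in your setup), could look like this:

from pathlib import Path

# Hypothetical sanity check before calling ModelEvaluation.evaluation();
# the paths are assumptions based on the trainer's save locations.
required = [
    "artifacts/model_trainer/pegasus-summary-lm",   # saved model directory
    "artifacts/model_trainer/tokenizer",            # saved tokenizer directory
    "artifacts/data_transformation/dataset",        # tokenized dataset
]
missing = [p for p in required if not Path(p).exists()]
if missing:
    raise FileNotFoundError(f"Run the earlier pipeline stages first; missing: {missing}")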
research/model_trainer.ipynb ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "os.chdir('../')"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "'d:\\\\Satyam Mishra\\\\NLP Project\\\\Text Summarization'"
30
+ ]
31
+ },
32
+ "execution_count": 3,
33
+ "metadata": {},
34
+ "output_type": "execute_result"
35
+ }
36
+ ],
37
+ "source": [
38
+ "%pwd"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 5,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "from dataclasses import dataclass\n",
48
+ "from pathlib import Path\n",
49
+ "\n",
50
+ "@dataclass(frozen=True)\n",
51
+ "class ModelTrainerConfig:\n",
52
+ " root_dir: Path\n",
53
+ " data_path: Path\n",
54
+ " model_ckpt: Path\n",
55
+ " num_train_epochs: int\n",
56
+ " warmup_steps: int\n",
57
+ " per_device_train_batch_size: int\n",
58
+ " weight_decay: float\n",
59
+ " logging_steps: int\n",
60
+ " evaluation_strategy: str\n",
61
+ " eval_steps: int\n",
62
+ " save_steps: float\n",
63
+ " gradient_accumulation_steps: int"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 6,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "from summarylm.constants import *\n",
73
+ "from summarylm.utils.common import read_yaml, create_directories"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 9,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "class ConfigurationManager:\n",
83
+ " def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):\n",
84
+ " self.config = read_yaml(config_filepath)\n",
85
+ " self.params = read_yaml(params_filepath)\n",
86
+ "\n",
87
+ " create_directories([self.config.artifacts_root])\n",
88
+ "\n",
89
+ " def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
90
+ " config = self.config.model_trainer\n",
91
+ " params = self.params.TrainingArguments\n",
92
+ "\n",
93
+ " create_directories([config.root_dir])\n",
94
+ "\n",
95
+ " model_trainer_config = ModelTrainerConfig(\n",
96
+ " root_dir=config.root_dir,\n",
97
+ " data_path=config.data_path,\n",
98
+ " model_ckpt=config.model_ckpt,\n",
99
+ " num_train_epochs=params.num_train_epochs,\n",
100
+ " warmup_steps=params.warmup_steps,\n",
101
+ " per_device_train_batch_size=params.per_device_train_batch_size,\n",
102
+ " weight_decay=params.weight_decay,\n",
103
+ " logging_steps=params.logging_steps,\n",
104
+ " evaluation_strategy=params.evaluation_strategy,\n",
105
+ " eval_steps=params.eval_steps,\n",
106
+ " save_steps=params.save_steps,\n",
107
+ " gradient_accumulation_steps=params.gradient_accumulation_steps,\n",
108
+ " )\n",
109
+ "\n",
110
+ " return model_trainer_config"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 11,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "from transformers import TrainingArguments, Trainer\n",
120
+ "from transformers import DataCollatorForSeq2Seq\n",
121
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
122
+ "from datasets import load_dataset, load_from_disk\n",
123
+ "import torch"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 12,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "class ModelTrainer:\n",
133
+ " def __init__(self, config: ModelTrainerConfig):\n",
134
+ " self.config = config\n",
135
+ "\n",
136
+ " def train(self):\n",
137
+ " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
138
+ " tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)\n",
139
+ " model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)\n",
140
+ " seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)\n",
141
+ "\n",
142
+ " # loading the dataset\n",
143
+ " dataset_pt = load_from_disk(self.config.data_path)\n",
144
+ "\n",
145
+ " trainer_args = TrainingArguments(\n",
146
+ " output_dir=self.config.root_dir,\n",
147
+ " num_train_epochs=self.config.num_train_epochs,\n",
148
+ " warmup_steps=self.config.warmup_steps,\n",
149
+ " per_device_train_batch_size=self.config.per_device_train_batch_size,\n",
150
+ " per_device_eval_batch_size=self.config.per_device_train_batch_size,\n",
151
+ " weight_decay=self.config.weight_decay,\n",
152
+ " logging_steps=self.config.logging_steps,\n",
153
+ " evaluation_strategy=self.config.evaluation_strategy,\n",
154
+ " eval_steps=self.config.eval_steps,\n",
155
+ " save_steps=self.config.save_steps,\n",
156
+ " gradient_accumulation_steps=self.config.gradient_accumulation_steps,\n",
157
+ " )\n",
158
+ "\n",
159
+ " trainer = Trainer(\n",
160
+ " model=model_pegasus,\n",
161
+ " args=trainer_args,\n",
162
+ " tokenizer=tokenizer, \n",
163
+ " data_collator=seq2seq_data_collator,\n",
164
+ " train_dataset=dataset_pt['train'],\n",
165
+ " eval_dataset=dataset_pt['validation']\n",
166
+ " )\n",
167
+ "\n",
168
+ " trainer.train()\n",
169
+ "\n",
170
+ " ## Save model\n",
171
+ " model_pegasus.save_pretrained(os.path.join(self.config.root_dir,\"pegasus-summary-lm\"))\n",
172
+ " ## Save tokenizer\n",
173
+ " tokenizer.save_pretrained(os.path.join(self.config.root_dir,\"tokenizer\"))"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 13,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "[2024-05-21 07:37:38,704: INFO: common: Yaml file: config\\config.yaml loaded successfully]\n",
186
+ "[2024-05-21 07:37:38,721: INFO: common: Yaml file: params.yaml loaded successfully]\n",
187
+ "[2024-05-21 07:37:38,739: INFO: common: Directory created successfully at: artifacts]\n",
188
+ "[2024-05-21 07:37:38,742: INFO: common: Directory created successfully at: artifacts/model_trainer]\n"
189
+ ]
190
+ },
191
+ {
192
+ "name": "stderr",
193
+ "output_type": "stream",
194
+ "text": [
195
+ "d:\\Satyam Mishra\\NLP Project\\Text Summarization\\env\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
196
+ " warnings.warn(\n"
197
+ ]
198
+ }
199
+ ],
200
+ "source": [
201
+ "try:\n",
202
+ " config = ConfigurationManager()\n",
203
+ " model_trainer_config = config.get_model_trainer_config()\n",
204
+ " model_trainer_config = ModelTrainer(config=model_trainer_config)\n",
205
+ " model_trainer_config.train()\n",
206
+ "except Exception as e:\n",
207
+ " raise e"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": []
216
+ }
217
+ ],
218
+ "metadata": {
219
+ "kernelspec": {
220
+ "display_name": "env",
221
+ "language": "python",
222
+ "name": "python3"
223
+ },
224
+ "language_info": {
225
+ "codemirror_mode": {
226
+ "name": "ipython",
227
+ "version": 3
228
+ },
229
+ "file_extension": ".py",
230
+ "mimetype": "text/x-python",
231
+ "name": "python",
232
+ "nbconvert_exporter": "python",
233
+ "pygments_lexer": "ipython3",
234
+ "version": "3.12.2"
235
+ }
236
+ },
237
+ "nbformat": 4,
238
+ "nbformat_minor": 2
239
+ }
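Note: fine-tuning Pegasus on the full concatenated dataset is a long GPU job, so it can help to smoke-test the notebook's Trainer wiring on a tiny slice first. A minimal sketch, assuming the tokenized DatasetDict saved by the data-transformation stage (the path below is an assumption; use whatever data_path points to in config.yaml):

from datasets import load_from_disk

# Hypothetical smoke test: shrink both splits so trainer.train() finishes quickly.
dataset_pt = load_from_disk("artifacts/data_transformation/dataset")
tiny_train = dataset_pt["train"].select(range(8))
tiny_eval = dataset_pt["validation"].select(range(8))

# Reuse the Trainer built in the cell above, only swapping the datasets:
# trainer = Trainer(model=model_pegasus, args=trainer_args, tokenizer=tokenizer,
#                   data_collator=seq2seq_data_collator,
#                   train_dataset=tiny_train, eval_dataset=tiny_eval)
# trainer.train()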
setup.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import setuptools
2
+
3
+ with open("README.md", "r", encoding="utf-8") as f:
4
+ long_description = f.read()
5
+
6
+ __version__ = "0.0.0"
7
+
8
+ AUTHOR_USER_NAME = "Satyam Mishra"
9
+ SRC_REPO = "SummaryLM"
10
+ AUTHOR_EMAIL = "[email protected]"
11
+
12
+ setuptools.setup(
13
+ name=SRC_REPO,
14
+ version=__version__,
15
+ author=AUTHOR_USER_NAME,
16
+ author_email=AUTHOR_EMAIL,
17
+ description='A text summarizer',
18
+ long_description=long_description,
19
+ long_description_content_type="text/markdown",
20
+ package_dir={"": "src"},
21
+ packages=setuptools.find_packages(where="src"),
22
+ )
src/summarylm/__init__.py ADDED
File without changes
src/summarylm/components/__init__.py ADDED
File without changes
src/summarylm/components/data_ingestion.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import zipfile
4
+ from pathlib import Path
5
+ from datasets import load_dataset
6
+
7
+ from summarylm.entity import DataIngestionConfig
8
+ from summarylm.utils.common import get_size
9
+ from summarylm.logging import logger
10
+ from summarylm.exception import CustomException
11
+
12
+ class DataIngestion:
13
+ """
14
+ Class to download data and store it in the artifacts folder
15
+
16
+ Args:
17
+ config (DataIngestionConfig): Contains all configuration for data ingestion
18
+
19
+ Returns:
20
+ None
21
+ """
22
+ def __init__(self, config: DataIngestionConfig):
23
+ self.config = config
24
+
25
+ def download_data(self):
26
+ """
27
+ Function to download datasets from the Hugging Face Hub and save them to disk
28
+ """
29
+ try:
30
+ for i in range(len(self.config.LOCAL_DATA_FILE)):
31
+ if not os.path.exists(self.config.LOCAL_DATA_FILE[i]):
32
+ dataset = load_dataset(self.config.ALL_HUGGINGFACE_DATA[i])
33
+ dataset.save_to_disk(self.config.LOCAL_DATA_FILE[i])
34
+ logger.info(f"{self.config.ALL_HUGGINGFACE_DATA[i]} downloaded!")
35
+ else:
36
+ logger.info(f"File already exists of size: {get_size(Path(self.config.LOCAL_DATA_FILE[i]))}")
37
+
38
+ except Exception as e:
39
+ raise CustomException(e, sys) from e
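Note: download_data pairs the entries of ALL_HUGGINGFACE_DATA and LOCAL_DATA_FILE by index, so the two lists must have the same length and order. A minimal sketch of a matching DataIngestionConfig (the dataset ids and paths are hypothetical; the real values live in config/config.yaml):

from summarylm.entity import DataIngestionConfig

config = DataIngestionConfig(
    root_dir="artifacts/data_ingestion",
    ALL_HUGGINGFACE_DATA=["xsum", "some-namespace/news-summaries"],  # assumed dataset ids
    LOCAL_DATA_FILE=[
        "artifacts/data_ingestion/xsum",
        "artifacts/data_ingestion/news-summaries",
    ],
)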
src/summarylm/components/data_transformation.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+ from summarylm.exception import CustomException
5
+ from summarylm.entity import DataTransformationConfig
6
+ from transformers import AutoTokenizer
7
+ from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict
8
+
9
+
10
+ class DataTransformation:
11
+ """
12
+ Class for transforming data into a valid format for training
13
+
14
+ Args:
15
+ config (DataTransformationConfig): Contains all configuration for data transformation
16
+ """
17
+ def __init__(self, config: DataTransformationConfig):
18
+ self.config = config
19
+ self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
20
+
21
+ def convert_data_into_right_format(self, datasets: list) -> DatasetDict:
22
+ """
23
+ Function to remove & rename columns and convert the data into the right format for training
24
+
25
+ Args:
26
+ datasets (list): list of all dataset paths
27
+
28
+ Returns:
29
+ DatasetDict: Contains train, test, and validation sets
30
+ """
31
+ try:
32
+ logger.info("Entered convert_data_into_right_format method of DataTransformation class.")
33
+ # loading all datasets
34
+ loaded_datasets = {}
35
+ print("Loading the dataset")
36
+ for data in datasets:
37
+ loaded_datasets[data] = load_from_disk(data)
38
+
39
+ dataset1 = loaded_datasets[datasets[0]]
40
+ dataset2 = loaded_datasets[datasets[1]]
41
+ print("Dataset loaded")
42
+
43
+ # removing unwanted columns from dataset1
44
+ dataset1_train = dataset1['train'].select_columns(['article', 'summary'])
45
+ dataset1_test = dataset1['test'].select_columns(['article', 'summary'])
46
+ dataset1_validation = dataset1['validation'].select_columns(['article', 'summary'])
47
+
48
+ # renaming data column name of dataset1
49
+ dataset1_train = dataset1_train.rename_column('article', 'text')
50
+ dataset1_test = dataset1_test.rename_column('article', 'text')
51
+ dataset1_validation = dataset1_validation.rename_column('article', 'text')
52
+
53
+ # renaming data column name of dataset2
54
+ dataset2_train = dataset2['train'].rename_column('document', 'text')
55
+ dataset2_test = dataset2['test'].rename_column('document', 'text')
56
+ dataset2_validation = dataset2['validation'].rename_column('document', 'text')
57
+
58
+ # concatenate_datasets
59
+ dataset_train = concatenate_datasets([dataset1_train, dataset2_train])
60
+ dataset_test = concatenate_datasets([dataset1_test, dataset2_test])
61
+ dataset_validation = concatenate_datasets([dataset1_validation, dataset2_validation])
62
+
63
+ # loading the dataset into DatasetDict
64
+ dataset = DatasetDict({
65
+ "train": dataset_train,
66
+ "validation": dataset_validation,
67
+ "test": dataset_test,
68
+ })
69
+ return dataset
70
+
71
+ except Exception as e:
72
+ raise CustomException(e, sys) from e
73
+
74
+ def convert_examples_to_features(self, example_batch):
75
+ """
76
+ Method to convert text and summary pairs into tokenized features
77
+
78
+ Args:
79
+ example_batch: dataset after loading it from datasets library
80
+ Returns:
81
+ input_ids: A list of token ids representing the dialogue
82
+ attention_mask: List of indices specifying which tokens should be attended to by the model
83
+ labels: A list of token ids representing the summary
84
+ """
85
+ try:
86
+ logger.info("Entered convert_examples_to_features method of DataTransformation class.")
87
+ input_encodings = self.tokenizer(example_batch['text'], max_length = 1024, truncation = True)
88
+
89
+ with self.tokenizer.as_target_tokenizer():
90
+ target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)
91
+
92
+ return {
93
+ 'input_ids': input_encodings['input_ids'],
94
+ 'attention_mask': input_encodings['attention_mask'],
95
+ 'labels': target_encodings['input_ids']
96
+ }
97
+ except Exception as e:
98
+ raise CustomException(e, sys) from e
99
+
100
+
101
+ def convert(self):
102
+ data1 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[0])
103
+ data2 = os.path.join(self.config.data_path, self.config.ALL_REQUIRED_DATA[1])
104
+
105
+ dataset = self.convert_data_into_right_format([data1, data2])
106
+ dataset_pt = dataset.map(self.convert_examples_to_features, batched=True)
107
+ dataset_pt.save_to_disk(os.path.join(self.config.root_dir, "dataset"))
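Note: convert_examples_to_features runs under datasets' batched map, so example_batch is a dict of lists and the tokenizer is applied to whole batches of texts and summaries at once. A minimal sketch of what one batched call produces, using a hypothetical Pegasus checkpoint in place of config.tokenizer_name:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")  # assumed checkpoint

example_batch = {
    "text": ["A long source article ...", "Another article ..."],
    "summary": ["Short summary.", "Another short summary."],
}
input_encodings = tokenizer(example_batch["text"], max_length=1024, truncation=True)
with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(example_batch["summary"], max_length=128, truncation=True)

# These are the three columns the map step adds to the dataset:
features = {
    "input_ids": input_encodings["input_ids"],
    "attention_mask": input_encodings["attention_mask"],
    "labels": target_encodings["input_ids"],
}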
src/summarylm/components/data_validation.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+ from summarylm.exception import CustomException
5
+ from summarylm.entity import DataValidationConfig
6
+
7
+
8
+ class DataValidation:
9
+ """
10
+ Class for validating that all data files exist in the train, test, and validation folders
11
+
12
+ Args:
13
+ config (DataValidationConfig): Contains all configuration for data validation
14
+
15
+ Returns:
16
+ validation_status (bool): True if all required data exists, else False
17
+ """
18
+ def __init__(self, config: DataValidationConfig):
19
+ self.config = config
20
+
21
+ def validate_all_files_exist(self) -> bool:
22
+ try:
23
+ logger.info("Entered validate_all_files_exist method of DataValidation class.")
24
+ validation_status = None
25
+
26
+ for data in self.config.ALL_REQUIRED_DATA:
27
+ all_files = os.listdir(os.path.join("artifacts", "data_ingestion", data))
28
+
29
+ for file in all_files:
30
+ if file not in self.config.ALL_REQUIRED_FILES:
31
+ validation_status = False
32
+
33
+ with open(self.config.STATUS_FILE, 'w') as f:
34
+ f.write(f"Validation status: {validation_status}")
35
+ else:
36
+ validation_status = True
37
+
38
+ with open(self.config.STATUS_FILE, 'w') as f:
39
+ f.write(f"Validation status: {validation_status}")
40
+
41
+ logger.info("Completed validate_all_files_exist method of DataValidation class.")
42
+
43
+ return validation_status
44
+ except Exception as e:
45
+ raise CustomException(e, sys) from e
src/summarylm/components/model_evaluation.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.entity import ModelEvaluationConfig
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ from datasets import load_dataset, load_from_disk, load_metric
4
+ import torch
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+
8
+
9
+ class ModelEvaluation:
10
+ def __init__(self, config: ModelEvaluationConfig):
11
+ self.config = config
12
+
13
+ def generate_batch_size_chunks(self, list_of_elements, batch_size):
14
+ """
15
+ Split the dataset into smaller batches that we can process simultaneously
16
+ Yield successive batch-sized chunks from list_of_elements.
17
+ """
18
+ for i in range(0, len(list_of_elements), batch_size):
19
+ yield list_of_elements[i : i + batch_size]
20
+
21
+ def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, batch_size=16,
22
+ device="cuda" if torch.cuda.is_available() else "cpu",
23
+ column_text="article",
24
+ column_summary="highlights"):
25
+ article_batches = list(self.generate_batch_size_chunks(dataset[column_text], batch_size))
26
+ target_batches = list(self.generate_batch_size_chunks(dataset[column_summary], batch_size))
27
+
28
+ for article_batch, target_batch in tqdm(
29
+ zip(article_batches, target_batches), total=len(article_batches)):
30
+
31
+ inputs = tokenizer(article_batch, max_length=1024, truncation=True,
32
+ padding="max_length", return_tensors="pt")
33
+
34
+ summaries = model.generate(input_ids=inputs["input_ids"].to(device),
35
+ attention_mask=inputs["attention_mask"].to(device),
36
+ length_penalty=0.8, num_beams=8, max_length=128)
37
+
38
+ # The length_penalty parameter ensures that the model does not generate sequences that are too long.
39
+
40
+ # Finally, we decode the generated texts,
41
+ # replace the <n> token, and add the decoded texts with the references to the metric.
42
+ decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
43
+ clean_up_tokenization_spaces=True) for s in summaries]
44
+
45
+ decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
46
+
47
+
48
+ metric.add_batch(predictions=decoded_summaries, references=target_batch)
49
+
50
+ # Finally compute and return the ROUGE scores.
51
+ score = metric.compute()
52
+ return score
53
+
54
+ def evaluation(self):
55
+ device = "cuda" if torch.cuda.is_available() else "cpu"
56
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
57
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
58
+
59
+ # loading data
60
+ dataset_pt = load_from_disk(self.config.data_path)
61
+
62
+ rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
63
+
64
+ rouge_metric = load_metric('rouge')
65
+
66
+ score = self.calculate_metric_on_test_ds(dataset_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary')
67
+ rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
68
+ df = pd.DataFrame(rouge_dict, index = ['pegasus'] )
69
+ df.to_csv(self.config.metric_file_name, index=False)
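Note: evaluation() writes a single-row CSV whose columns are the four ROUGE variants (the 'pegasus' row label is dropped because of index=False). A minimal sketch of inspecting it afterwards; the file path is an assumption standing in for config.metric_file_name:

import pandas as pd

scores = pd.read_csv("artifacts/model_evaluation/metrics.csv")  # assumed path
print(scores[["rouge1", "rouge2", "rougeL", "rougeLsum"]])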
src/summarylm/components/model_trainer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.entity import ModelTrainerConfig
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+ from transformers import TrainingArguments, Trainer
7
+ from transformers import DataCollatorForSeq2Seq
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
+ from datasets import load_dataset, load_from_disk
10
+ import torch
11
+
12
+
13
+ class ModelTrainer:
14
+ """
15
+ Class for training model
16
+
17
+ Args:
18
+ config (ModelTrainerConfig): Contains all configuration for model training
19
+ """
20
+ def __init__(self, config: ModelTrainerConfig):
21
+ self.config = config
22
+
23
+ def train(self):
24
+ """
25
+ Method to train the Pegasus model
26
+ """
27
+ logger.info("Entered train method of ModelTrainer class.")
28
+ try:
29
+ device = "cuda" if torch.cuda.is_available() else "cpu"
30
+ tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
31
+ model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
32
+ seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
33
+
34
+ # loading the dataset
35
+ dataset_pt = load_from_disk(self.config.data_path)
36
+
37
+ trainer_args = TrainingArguments(
38
+ output_dir=self.config.root_dir,
39
+ num_train_epochs=self.config.num_train_epochs,
40
+ warmup_steps=self.config.warmup_steps,
41
+ per_device_train_batch_size=self.config.per_device_train_batch_size,
42
+ per_device_eval_batch_size=self.config.per_device_train_batch_size,
43
+ weight_decay=self.config.weight_decay,
44
+ logging_steps=self.config.logging_steps,
45
+ evaluation_strategy=self.config.evaluation_strategy,
46
+ eval_steps=self.config.eval_steps,
47
+ save_steps=1e6,
48
+ gradient_accumulation_steps=self.config.gradient_accumulation_steps,
49
+ )
50
+
51
+ trainer = Trainer(
52
+ model=model_pegasus,
53
+ args=trainer_args,
54
+ tokenizer=tokenizer,
55
+ data_collator=seq2seq_data_collator,
56
+ train_dataset=dataset_pt['train'],
57
+ eval_dataset=dataset_pt['validation']
58
+ )
59
+
60
+ trainer.train()
61
+
62
+ ## Save model
63
+ model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-summary-lm"))
64
+ ## Save tokenizer
65
+ tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))
66
+
67
+ logger.info("Completed trian method of ModelTrainer class.")
68
+ except Exception as e:
69
+ raise CustomException(e, sys) from e
src/summarylm/config/__init__.py ADDED
File without changes
src/summarylm/config/configuration.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.constants import *
2
+ from summarylm.utils.common import read_yaml, create_directories
3
+ from summarylm.entity import (DataIngestionConfig, DataValidationConfig, DataTransformationConfig, ModelTrainerConfig, ModelEvaluationConfig)
4
+
5
+ class ConfigurationManager:
6
+ """
7
+ Configuration Manager for Data Ingestion, Data Validation, Data Transformation, Model Training, and Model Evaluation
8
+
9
+ Args:
10
+ config_filepath (Path): Path to config yaml file
11
+ params_filepath (Path): Path to params yaml file
12
+
13
+ Returns:
14
+ None
15
+ """
16
+ def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:
17
+ self.config = read_yaml(config_filepath)
18
+ self.params = read_yaml(params_filepath)
19
+
20
+ create_directories([self.config.artifacts_root])
21
+
22
+ def get_data_ingestion_config(self) -> DataIngestionConfig:
23
+ config = self.config.data_ingestion
24
+
25
+ create_directories([config.root_dir])
26
+
27
+ data_ingestion_config = DataIngestionConfig(
28
+ root_dir=config.root_dir,
29
+ ALL_HUGGINGFACE_DATA=config.ALL_HUGGINGFACE_DATA,
30
+ LOCAL_DATA_FILE = config.LOCAL_DATA_FILE,
31
+ )
32
+
33
+
34
+ return data_ingestion_config
35
+
36
+ def get_data_validation_config(self) -> DataValidationConfig:
37
+ config = self.config.data_validation
38
+
39
+ create_directories([config.root_dir])
40
+
41
+ data_validation_config = DataValidationConfig(
42
+ root_dir=config.root_dir,
43
+ STATUS_FILE=config.STATUS_FILE,
44
+ ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
45
+ ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
46
+ )
47
+
48
+ return data_validation_config
49
+
50
+ def get_data_transformation_config(self) -> DataTransformationConfig:
51
+ config = self.config.data_transformation
52
+
53
+ create_directories([config.root_dir])
54
+
55
+ data_transformation_config = DataTransformationConfig(
56
+ root_dir=config.root_dir,
57
+ data_path=config.data_path,
58
+ ALL_REQUIRED_DATA=config.ALL_REQUIRED_DATA,
59
+ tokenizer_name=config.tokenizer_name
60
+ )
61
+
62
+ return data_transformation_config
63
+
64
+ def get_model_trainer_config(self) -> ModelTrainerConfig:
65
+ config = self.config.model_trainer
66
+ params = self.params.TrainingArguments
67
+
68
+ create_directories([config.root_dir])
69
+
70
+ model_trainer_config = ModelTrainerConfig(
71
+ root_dir=config.root_dir,
72
+ data_path=config.data_path,
73
+ model_ckpt=config.model_ckpt,
74
+ num_train_epochs=params.num_train_epochs,
75
+ warmup_steps=params.warmup_steps,
76
+ per_device_train_batch_size=params.per_device_train_batch_size,
77
+ weight_decay=params.weight_decay,
78
+ logging_steps=params.logging_steps,
79
+ evaluation_strategy=params.evaluation_strategy,
80
+ eval_steps=params.eval_steps,
81
+ save_steps=params.save_steps,
82
+ gradient_accumulation_steps=params.gradient_accumulation_steps,
83
+ )
84
+
85
+ return model_trainer_config
86
+
87
+ def get_model_evaluation_config(self) -> ModelEvaluationConfig:
88
+ config = self.config.model_evaluation
89
+
90
+ create_directories([config.root_dir])
91
+
92
+ model_evaluation_config = ModelEvaluationConfig(
93
+ root_dir=config.root_dir,
94
+ data_path=config.data_path,
95
+ model_path=config.model_path,
96
+ tokenizer_path=config.tokenizer_path,
97
+ metric_file_name=config.metric_file_name,
98
+ )
99
+
100
+ return model_evaluation_config
src/summarylm/config/gcloud_syncer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ class GCloudSync:
4
+
5
+ def sync_folder_to_gcloud(self, gcp_bucket_url, filepath, filename):
6
+
7
+ """
8
+ Function to sync files from local machine to Google Cloud Storage
9
+
10
+ Args:
11
+ gcp_bucket_url (str): Google Cloud Storage bucket URL
12
+ filepath (str): Local file path
13
+ filename (str): Local file name
14
+
15
+ """
16
+
17
+ command = f"gsutil cp {filename}/{filepath} gs://{gcp_bucket_url}"
18
+
19
+ os.system(command)
20
+
21
+ def sync_folder_from_gcloud(self, gcp_bucket_url, filename, destination):
22
+
23
+ """
24
+ Function to sync a file from Google Cloud Storage to the local machine
25
+
26
+ Args:
27
+ gcp_bucket_url (str): Google Cloud Storage bucket URL
28
+ filename (str): Local file name
29
+ destination (str): Local file path
30
+ """
31
+
32
+ command = f"gsutil cp gs://{gcp_bucket_url}/{filename} {destination}/{filename}"
33
+
34
+ os.system(command)
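Note: GCloudSync only shells out to gsutil, so the Google Cloud SDK has to be installed and authenticated on the machine running the pipeline; nothing is validated in Python. A minimal usage sketch with hypothetical bucket, folder, and file names:

from summarylm.config.gcloud_syncer import GCloudSync

syncer = GCloudSync()
# Upload artifacts/model_trainer/model.zip to the bucket (all names are assumptions).
syncer.sync_folder_to_gcloud("my-summarylm-bucket", "artifacts/model_trainer", "model.zip")
# Download it back into a local folder.
syncer.sync_folder_from_gcloud("my-summarylm-bucket", "model.zip", "artifacts/model_trainer")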
src/summarylm/constants/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ CONFIG_FILE_PATH = Path("config/config.yaml")
4
+ PARAMS_FILE_PATH = Path("params.yaml")
src/summarylm/entity/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+ @dataclass(frozen=True)
5
+ class DataIngestionConfig:
6
+ root_dir: Path
7
+ ALL_HUGGINGFACE_DATA: list
8
+ LOCAL_DATA_FILE: list
9
+
10
+ @dataclass(frozen=True)
11
+ class DataValidationConfig:
12
+ root_dir: Path
13
+ STATUS_FILE: str
14
+ ALL_REQUIRED_DATA: list
15
+ ALL_REQUIRED_FILES: list
16
+
17
+ @dataclass(frozen=True)
18
+ class DataTransformationConfig:
19
+ root_dir: Path
20
+ data_path: Path
21
+ ALL_REQUIRED_DATA: list
22
+ tokenizer_name: Path
23
+
24
+ @dataclass(frozen=True)
25
+ class ModelTrainerConfig:
26
+ root_dir: Path
27
+ data_path: Path
28
+ model_ckpt: Path
29
+ num_train_epochs: int
30
+ warmup_steps: int
31
+ per_device_train_batch_size: int
32
+ weight_decay: float
33
+ logging_steps: int
34
+ evaluation_strategy: str
35
+ eval_steps: int
36
+ save_steps: float
37
+ gradient_accumulation_steps: int
38
+
39
+ @dataclass(frozen=True)
40
+ class ModelEvaluationConfig:
41
+ root_dir: Path
42
+ data_path: Path
43
+ model_path: Path
44
+ tokenizer_path: Path
45
+ metric_file_name: Path
src/summarylm/exception/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from summarylm.logging import logger
4
+
5
+ def error_message_detail(error, error_detail):
6
+ """
7
+ Returns the error message with error details and logs the error
8
+
9
+ Args:
10
+ error: error message
11
+ error_detail: error details
12
+
13
+ Returns:
14
+ error_message: error message
15
+ """
16
+ _, _, exe_tb = error_detail.exc_info()
17
+ file_name = exe_tb.tb_frame.f_code.co_filename
18
+ line_number = exe_tb.tb_lineno
19
+ error_message = "Error occured in file called [{0}] line number: [{1}] error message: [{2}]".format(
20
+ file_name, line_number, str(error)
21
+ )
22
+
23
+ logger.info(error_message)
24
+
25
+ return error_message
26
+
27
+ class CustomException(Exception):
28
+ def __init__(self, error_message, error_detail):
29
+ super().__init__(error_message)
30
+ self.error_message = error_message_detail(error_message, error_detail=error_detail)
31
+
32
+ def __str__(self):
33
+ return self.error_message
34
+
src/summarylm/logging/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+
5
+ """
6
+ Logs every event to a log file in the logs directory.
7
+ """
8
+
9
+ LOG_FILE = f"running_logs.log"
10
+ logs_path = os.path.join(os.getcwd(), "logs")
11
+ os.makedirs(logs_path, exist_ok=True)
12
+
13
+
14
+ LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)
15
+
16
+ logging.basicConfig(
17
+ filename=LOG_FILE_PATH,
18
+ format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
19
+ level=logging.INFO,
20
+ )
21
+
22
+ logger = logging.getLogger("textSummarizerLogger")
src/summarylm/pipeline/__init__.py ADDED
File without changes
src/summarylm/pipeline/data_ingestion.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_ingestion import DataIngestion
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataIngestionPipeline:
9
+ """
10
+ Pipeline for data ingestion
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_ingestion_config = config.get_data_ingestion_config()
19
+ data_ingestion = DataIngestion(config=data_ingestion_config)
20
+ data_ingestion.download_data()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/data_transformation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_transformation import DataTransformation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataTransformationPipeline:
9
+ """
10
+ Pipeline for data transformation to convert data into the right format
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_transformation_config = config.get_data_transformation_config()
19
+ data_transformation = DataTransformation(config=data_transformation_config)
20
+ data_transformation.convert()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/data_validation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.data_validation import DataValidation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class DataValidationPipeline:
9
+ """
10
+ Pipeline for validating if data exists
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ data_validation_config = config.get_data_validation_config()
19
+ data_validation = DataValidation(config=data_validation_config)
20
+ data_validation.validate_all_files_exist()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/model_evaluation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.model_evaluation import ModelEvaluation
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class ModelEvaluationPipeline:
9
+ """
10
+ Pipeline for pegasus model evaluation
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ model_evaluation_config = config.get_model_evaluation_config()
19
+ model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
20
+ model_evaluation_config.evaluation()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/model_trainer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from summarylm.config.configuration import ConfigurationManager
3
+ from summarylm.components.model_trainer import ModelTrainer
4
+ from summarylm.logging import logger
5
+ from summarylm.exception import CustomException
6
+
7
+
8
+ class ModelTrainerPipeline:
9
+ """
10
+ Pipeline for training pegasus model
11
+ """
12
+ def __init__(self) -> None:
13
+ pass
14
+
15
+ def main(self):
16
+ try:
17
+ config = ConfigurationManager()
18
+ model_trainer_config = config.get_model_trainer_config()
19
+ model_trainer_config = ModelTrainer(config=model_trainer_config)
20
+ model_trainer_config.train()
21
+ except Exception as e:
22
+ raise CustomException(e, sys) from e
src/summarylm/pipeline/prediction.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarylm.config.configuration import ConfigurationManager
2
+ from transformers import AutoTokenizer
3
+ from transformers import pipeline
4
+
5
+ class PredictionPipeline:
6
+ def __init__(self):
7
+ self.config = ConfigurationManager().get_model_evaluation_config()
8
+
9
+ def predict(self, text, max_length):
10
+ tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
11
+ gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": max_length}
12
+
13
+ pipe = pipeline("summarization", model=self.config.model_path, tokenizer=tokenizer)
14
+
15
+ output = pipe(text, **gen_kwargs)[0]["summary_text"]
16
+
17
+ return output
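Note: PredictionPipeline wraps the fine-tuned checkpoint and tokenizer in a transformers summarization pipeline, so inference is a single call once the model artifacts exist. A minimal usage sketch (the input text is a placeholder and max_length is a free choice):

from summarylm.pipeline.prediction import PredictionPipeline

predictor = PredictionPipeline()
article = "Paste the long article or dialogue to be summarized here ..."
summary = predictor.predict(article, max_length=128)
print(summary)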
src/summarylm/utils/__init__.py ADDED
File without changes
src/summarylm/utils/common.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from box.exceptions import BoxValueError
4
+ from summarylm.exception import CustomException
5
+ import yaml
6
+ from summarylm.logging import logger
7
+ from ensure import ensure_annotations
8
+ from box import ConfigBox
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
+ @ensure_annotations
14
+ def read_yaml(path_to_yaml: Path) -> ConfigBox:
15
+ """
16
+ Reads a yaml file and returns its contents
17
+
18
+ Args:
19
+ path_to_yaml (Path): path to the yaml file
20
+
21
+ Raises:
22
+ ValueError: if yaml file is empty
23
+
24
+ Returns:
25
+ ConfigBox: ConfigBox type
26
+ """
27
+
28
+ try:
29
+ with open(path_to_yaml) as yaml_file:
30
+ content = yaml.safe_load(yaml_file)
31
+ logger.info(f"Yaml file: {path_to_yaml} loaded successfully")
32
+ return ConfigBox(content)
33
+ except BoxValueError:
34
+ raise ValueError("yaml file is empty")
35
+ except Exception as e:
36
+ raise CustomException(e, sys) from e
37
+
38
+ @ensure_annotations
39
+ def create_directories(path_to_directories: list, verbose=True):
40
+ """
41
+ Create list of directories
42
+
43
+ Args:
44
+ path_to_directories (list): list of path of directories
45
+ verbose (bool, optional): log each created directory. Defaults to True
46
+ """
47
+
48
+ for path in path_to_directories:
49
+ os.makedirs(path, exist_ok=True)
50
+ if verbose:
51
+ logger.info(f"Directory created successfully at: {path}")
52
+
53
+ @ensure_annotations
54
+ def get_size(path: Path) -> str:
55
+ """
56
+ Get size in KB
57
+
58
+ Args:
59
+ path (Path): path of the file
60
+
61
+ Returns:
62
+ str: size in KB
63
+ """
64
+ size_in_kb = round(os.path.getsize(path)/1024)
65
+ return f"~{size_in_kb} KB"
template.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
6
+
7
+ project_name = "summarylm"
8
+
9
+ list_of_file = [
10
+ ".github/workflows/.gitkeep",
11
+ f"src/{project_name}/__init__.py",
12
+ f"src/{project_name}/components/__init__.py",
13
+ f"src/{project_name}/components/data_ingestion.py",
14
+ f"src/{project_name}/components/data_transformation.py",
15
+ f"src/{project_name}/components/data_validation.py",
16
+ f"src/{project_name}/components/model_evaluation.py",
17
+ f"src/{project_name}/components/model_trainer.py",
18
+ f"src/{project_name}/utils/__init__.py",
19
+ f"src/{project_name}/utils/common.py",
20
+ f"src/{project_name}/logging/__init__.py",
21
+ f"src/{project_name}/exception/__init__.py",
22
+ f"src/{project_name}/config/__init__.py",
23
+ f"src/{project_name}/config/configuration.py",
24
+ f"src/{project_name}/config/gcloud_syncer.py",
25
+ f"src/{project_name}/pipeline/__init__.py",
26
+ f"src/{project_name}/pipeline/data_ingestion.py",
27
+ f"src/{project_name}/entity/__init__.py",
28
+ f"src/{project_name}/constants/__init__.py",
29
+ "config/config.yaml",
30
+ "params.yaml",
31
+ "app.py",
32
+ "main.py",
33
+ "Dockerfile",
34
+ "requirements.txt",
35
+ "setup.py",
36
+ "research/experiment.ipynb",
37
+ ]
38
+
39
+ for filepath in list_of_file:
40
+ filepath = Path(filepath)
41
+ filedir, filename = os.path.split(filepath)
42
+
43
+ if filedir != "":
44
+ os.makedirs(filedir, exist_ok=True)
45
+ logging.info(f"Creating directory: {filedir} for the file {filename}")
46
+
47
+
48
+ if (not os.path.exists(filepath)) or (os.path.getsize(filepath) == 0):
49
+ with open(filepath, 'w') as f:
50
+ pass
51
+ logging.info(f"Creating empty file: {filepath}")
52
+ else:
53
+ logging.info(f"{filename} is already exists")