Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Parts of the code are based on the source code of memit | |
| MIT License | |
| Copyright (c) 2022 Kevin Meng | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in all | |
| copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| SOFTWARE. | |
| """ | |
| import json | |
| from pathlib import Path | |
| import torch | |
| from transformers import AutoTokenizer | |
# Base URL of the remote host the dataset file is downloaded from.
REMOTE_ROOT_URL = "https://rome.baulab.info"
# Full URL of the zsRE evaluation JSON, built on top of REMOTE_ROOT_URL.
REMOTE_URL = REMOTE_ROOT_URL + "/data/dsets/zsre_mend_eval.json"
class MENDQADataset:
    """
    Dataset of factual knowledge based on zsRE.
    Specifically selected from the QA validation slice from Mitchell et al.
    Project page: http://nlp.cs.washington.edu/zeroshot/

    Each item is a dict with the keys ``case_id``, ``requested_rewrite``,
    ``paraphrase_prompts``, ``neighborhood_prompts``, ``attribute_prompts``
    and ``generation_prompts``.
    """

    def __init__(self, data_dir: str, tok: "AutoTokenizer", size=None, *args, **kwargs):
        """Load ``zsre_mend_eval.json`` from *data_dir* and build the cases.

        Args:
            data_dir: Directory holding (or to receive) ``zsre_mend_eval.json``.
                If the file is missing, the directory is created and the file
                is downloaded from ``REMOTE_URL``.
            tok: Tokenizer. It is called on a string and must return a mapping
                with an ``"input_ids"`` sequence; it must also provide
                ``decode`` for both a sequence of ids and a single id.
            size: Upper slice bound applied to the records (``None`` keeps
                all of them).
            *args, **kwargs: Accepted for call-site compatibility and ignored.

        Raises:
            AssertionError: If a retained record's ``loc`` field does not
                contain ``"nq question: "``.
        """
        data_dir = Path(data_dir)
        zsre_loc = data_dir / "zsre_mend_eval.json"
        if not zsre_loc.exists():
            print(f"{zsre_loc} does not exist. Downloading from {REMOTE_URL}")
            data_dir.mkdir(exist_ok=True, parents=True)
            torch.hub.download_url_to_file(REMOTE_URL, zsre_loc)

        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(zsre_loc, "r", encoding="utf-8") as f:
            raw = json.load(f)

        # Slice before building: the retained items (and their case ids) are
        # the same as slicing afterwards, but records that would be thrown
        # away are neither validated nor tokenized.
        self._data = [
            self._build_case(case_id, record, tok)
            for case_id, record in enumerate(raw[:size])
        ]

    @staticmethod
    def _build_case(case_id, record, tok):
        """Convert one raw zsRE record into the dict stored for *case_id*."""
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type stays the same.
        if "nq question: " not in record["loc"]:
            raise AssertionError(
                "Neighborhood prompt missing `nq question:`. Check for errors?"
            )

        ans_toks = tok(" " + record["loc_ans"])["input_ids"]
        return {
            "case_id": case_id,
            "requested_rewrite": {
                # Turn the question into a template with the subject blanked.
                "prompt": record["src"].replace(record["subject"], "{}"),
                "subject": record["subject"],
                # Only the first listed answer is used as the new target.
                "target_new": {"str": record["answers"][0]},
                # Fixed placeholder; no field of the record is read for it.
                "target_true": {"str": "<|endoftext|>"},
            },
            "paraphrase_prompts": [record["rephrase"]],
            # One prompt per answer token: the prompt ends with the decoded
            # answer tokens before that position, the target is that token.
            "neighborhood_prompts": [
                {
                    "prompt": record["loc"] + "?" + tok.decode(ans_toks[:pos]),
                    "target": tok.decode(ans_tok),
                }
                for pos, ans_tok in enumerate(ans_toks)
            ],
            "attribute_prompts": [],
            "generation_prompts": [],
        }

    def __getitem__(self, item):
        """Return the case (or slice of cases) at *item*."""
        return self._data[item]

    def __len__(self):
        """Return the number of loaded cases."""
        return len(self._data)