Spaces:
Runtime error
Runtime error
import os

# Must be set before `datasets` (and, through it, `huggingface_hub`) is
# imported: the hub library reads this flag once, at import time, so setting
# it afterwards has no effect. Requires the `hf_transfer` package.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import io
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import PIL.Image
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
from tqdm.auto import tqdm
# Load the OCR corpus and pull out the two "train" columns that the image
# URLs are built from.
dataset = load_dataset("biglam/berlin_state_library_ocr")
file_names, ppns = (dataset["train"][column] for column in ("file name", "ppn"))
# Sanity check: there must be exactly one PPN per file name.
assert len(ppns) == len(file_names)
def create_url(filename, PPN):
    """Return the URL of the full-size JPEG for one page.

    Args:
        filename: File name of the page; only the text before its first
            ``"."`` is used as the page identifier.
        PPN: Identifier interpolated after the literal ``PPN`` prefix.

    Returns:
        The image URL on ``content.staatsbibliothek-berlin.de`` as a string.
    """
    page, _, _ = filename.partition(".")
    return (
        "https://content.staatsbibliothek-berlin.de/dc/"
        f"PPN{PPN}-{page}/full/full/0/default.jpg"
    )
# Build one image URL per row. `total` is passed explicitly because a bare
# zip() has no length, so tqdm could not otherwise show real progress.
urls = [
    create_url(file_name, ppn)
    for file_name, ppn in tqdm(zip(file_names, ppns), total=len(file_names))
]
# From here on `dataset` is the "train" split extended with a "url" column.
dataset = dataset["train"].add_column("url", urls)
# Keep at most the first 100,000 rows; clamping avoids an out-of-range
# selection should the split ever hold fewer rows than that.
dataset = dataset.select(range(min(100_000, len(dataset))))
# User-agent string sent with every image request.
USER_AGENT = get_datasets_user_agent()
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download ``image_url`` and return it as a PIL image, or ``None``.

    Args:
        image_url: URL to request.
        timeout: Seconds passed to ``urllib.request.urlopen``; ``None``
            leaves the default socket timeout in place.
        retries: Number of additional attempts made after a failed first
            attempt. A negative value results in no attempt at all.

    Returns:
        The image opened from the response body, or ``None`` when every
        attempt raised or no attempt was made.
    """
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                # The body is read fully into memory, so the returned image
                # does not depend on the connection staying open.
                return PIL.Image.open(io.BytesIO(response.read()))
        except Exception:
            # Deliberately best-effort: any failure (bad URL, network error,
            # undecodable payload) just triggers the next attempt.
            continue
    # Reached when all attempts failed, or when `retries` is negative and the
    # loop never ran (previously an UnboundLocalError).
    return None
def fetch_images(batch, num_threads, timeout=None, retries=0):
    """Fetch the image for every URL in ``batch`` using a thread pool.

    Args:
        batch: Mapping with a ``"url"`` entry holding the URLs to download.
        num_threads: Maximum number of worker threads in the pool.
        timeout: Forwarded to ``fetch_single_image`` for each URL.
        retries: Forwarded to ``fetch_single_image`` for each URL.

    Returns:
        The same ``batch`` object, with an ``"image"`` entry holding one
        result per URL, in the same order as ``batch["url"]``.
    """

    def download(url):
        return fetch_single_image(url, timeout=timeout, retries=retries)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        images = list(pool.map(download, batch["url"]))
    batch["image"] = images
    return batch
# Number of concurrent download threads used for each batch.
num_threads = 20
# Per-request timeout in seconds. Without one, a stalled connection would
# block its worker thread (and therefore the whole batch) indefinitely.
fetch_timeout = 30
# Download the images batch by batch, adding an "image" column.
dset = dataset.map(
    fetch_images,
    batched=True,
    batch_size=100,
    fn_kwargs={"num_threads": num_threads, "timeout": fetch_timeout},
)
dset.push_to_hub("davanstrien/berlin_state_library_ocr_with_images")