Commit 5476d32
Parent(s): Duplicate from philschmid/donut-base-finetuned-cord-v2
Co-authored-by: Philipp Schmid <[email protected]>
- .gitattributes +32 -0
- README.md +80 -0
- added_tokens.json +60 -0
- config.json +187 -0
- create_handler.ipynb +167 -0
- handler.py +46 -0
- preprocessor_config.json +24 -0
- pytorch_model.bin +3 -0
- res.png +0 -0
- sample.png +0 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +18 -0
- tokenizer.json +0 -0
- tokenizer_config.json +22 -0
.gitattributes
ADDED
@@ -0,0 +1,32 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md
ADDED
@@ -0,0 +1,80 @@
+---
+license: mit
+tags:
+- donut
+- image-to-text
+- vision
+- endpoints-template
+---
+
+# Fork of [naver-clova-ix/donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2)
+
+> This is a fork of [naver-clova-ix/donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2) implementing a custom `handler.py` as an example of how to use `donut` models with [inference-endpoints](https://hf.co/inference-endpoints).
+
+---
+
+# Donut (base-sized model, fine-tuned on CORD)
+
+Donut model fine-tuned on CORD. It was introduced in the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim et al. and first released in [this repository](https://github.com/clovaai/donut).
+
+Donut consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes it into a tensor of embeddings (of shape `batch_size, seq_len, hidden_size`), after which the decoder autoregressively generates text conditioned on the encoder output.
+
+# Use with Inference Endpoints
+
+Hugging Face Inference Endpoints can work directly with binary data, which means we can send the document image straight to the endpoint. We use the `requests` library to send the request (make sure you have it installed: `pip install requests`).
+
+
+
+## Send requests with Python
+
+Load the sample image:
+
+```bash
+wget https://huggingface.co/philschmid/donut-base-finetuned-cord-v2/resolve/main/sample.png
+```
+
+Send the request to the endpoint:
+
+```python
+import json
+import requests as r
+import mimetypes
+
+ENDPOINT_URL = ""  # url of your endpoint
+HF_TOKEN = ""  # token of the account/organization where you deployed your endpoint
+
+def predict(path_to_image: str = None):
+    with open(path_to_image, "rb") as i:
+        b = i.read()
+    headers = {
+        "Authorization": f"Bearer {HF_TOKEN}",
+        "Content-Type": mimetypes.guess_type(path_to_image)[0],
+    }
+    response = r.post(ENDPOINT_URL, headers=headers, data=b)
+    return response.json()
+
+prediction = predict(path_to_image="sample.png")
+
+print(prediction)
+# {'menu': [{'nm': '0571-1854 BLUS WANITA',
+#    'unitprice': '@120.000',
+#    'cnt': '1',
+#    'price': '120,000'},
+#   {'nm': '1002-0060 SHOPPING BAG', 'cnt': '1', 'price': '0'}],
+#  'total': {'total_price': '120,000',
+#   'changeprice': '0',
+#   'creditcardprice': '120,000',
+#   'menuqty_cnt': '1'}}
+```
+
+
+
+**curl example**
+
+```bash
+curl https://ak7gduay2ypyr9vp.us-east-1.aws.endpoints.huggingface.cloud \
+-X POST \
+--data-binary '@sample.png' \
+-H "Authorization: Bearer XXX" \
+-H "Content-Type: image/png"
+```

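For reference, the prediction shown in the README can also be reproduced locally, without an endpoint, by mirroring the `handler.py` shipped in this commit. A minimal sketch, assuming `transformers` (with Donut support), `torch`, and `sentencepiece` are installed:

```python
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

# load processor and model from this repository
processor = DonutProcessor.from_pretrained("philschmid/donut-base-finetuned-cord-v2")
model = VisionEncoderDecoderModel.from_pretrained("philschmid/donut-base-finetuned-cord-v2")

# encode the document image into pixel values for the Swin encoder
image = Image.open("sample.png").convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values

# prime the decoder with the CORD task prompt token (see added_tokens.json)
decoder_input_ids = processor.tokenizer(
    "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
).input_ids

# autoregressively generate the tagged output sequence
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
)
print(processor.token2json(processor.batch_decode(outputs)[0]))
```
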
added_tokens.json
ADDED
@@ -0,0 +1,60 @@
+{
+  "</s_cashprice>": 57549,
+  "</s_changeprice>": 57551,
+  "</s_cnt>": 57529,
+  "</s_creditcardprice>": 57563,
+  "</s_discount_price>": 57557,
+  "</s_discountprice>": 57567,
+  "</s_emoneyprice>": 57569,
+  "</s_etc>": 57541,
+  "</s_itemsubtotal>": 57577,
+  "</s_menu>": 57525,
+  "</s_menuqty_cnt>": 57555,
+  "</s_menutype_cnt>": 57553,
+  "</s_nm>": 57527,
+  "</s_num>": 57565,
+  "</s_othersvc_price>": 57573,
+  "</s_price>": 57531,
+  "</s_service_price>": 57537,
+  "</s_sub>": 57547,
+  "</s_sub_total>": 57533,
+  "</s_subtotal_price>": 57535,
+  "</s_tax_price>": 57539,
+  "</s_total>": 57543,
+  "</s_total_etc>": 57561,
+  "</s_total_price>": 57545,
+  "</s_unitprice>": 57559,
+  "</s_vatyn>": 57575,
+  "</s_void_menu>": 57571,
+  "<s_cashprice>": 57550,
+  "<s_changeprice>": 57552,
+  "<s_cnt>": 57530,
+  "<s_cord-v2>": 57579,
+  "<s_creditcardprice>": 57564,
+  "<s_discount_price>": 57558,
+  "<s_discountprice>": 57568,
+  "<s_emoneyprice>": 57570,
+  "<s_etc>": 57542,
+  "<s_iitcdip>": 57523,
+  "<s_itemsubtotal>": 57578,
+  "<s_menu>": 57526,
+  "<s_menuqty_cnt>": 57556,
+  "<s_menutype_cnt>": 57554,
+  "<s_nm>": 57528,
+  "<s_num>": 57566,
+  "<s_othersvc_price>": 57574,
+  "<s_price>": 57532,
+  "<s_service_price>": 57538,
+  "<s_sub>": 57548,
+  "<s_sub_total>": 57534,
+  "<s_subtotal_price>": 57536,
+  "<s_synthdog>": 57524,
+  "<s_tax_price>": 57540,
+  "<s_total>": 57544,
+  "<s_total_etc>": 57562,
+  "<s_total_price>": 57546,
+  "<s_unitprice>": 57560,
+  "<s_vatyn>": 57576,
+  "<s_void_menu>": 57572,
+  "<sep/>": 57522
+}

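These added tokens are the field tags Donut emits for CORD receipts (`<s_nm>`/`</s_nm>` for item name, `<s_cnt>` for count, and so on), plus the task prompt `<s_cord-v2>` the handler uses to prime the decoder. A small sketch of how a tagged sequence maps back to JSON via `DonutProcessor.token2json`; the sequence here is a hypothetical example, not real model output:

```python
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("philschmid/donut-base-finetuned-cord-v2")

# hypothetical tagged sequence built from the tokens defined above
sequence = "<s_menu><s_nm>SHOPPING BAG</s_nm><s_cnt>1</s_cnt></s_menu>"
print(processor.token2json(sequence))
# roughly: {'menu': {'nm': 'SHOPPING BAG', 'cnt': '1'}}
```
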
config.json
ADDED
@@ -0,0 +1,187 @@
+{
+  "architectures": [
+    "VisionEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": true,
+    "add_final_layer_norm": true,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_model": 1024,
+    "decoder_attention_heads": 16,
+    "decoder_ffn_dim": 4096,
+    "decoder_layerdrop": 0.0,
+    "decoder_layers": 4,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "encoder_attention_heads": 16,
+    "encoder_ffn_dim": 4096,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 12,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": 2,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "init_std": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 768,
+    "min_length": 0,
+    "model_type": "mbart",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_embedding": true,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.22.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 57580
+  },
+  "encoder": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depths": [
+      2,
+      2,
+      14,
+      2
+    ],
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.1,
+    "early_stopping": false,
+    "embed_dim": 128,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": [
+      1280,
+      960
+    ],
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "mlp_ratio": 4.0,
+    "model_type": "donut-swin",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_heads": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_layers": 4,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 4,
+    "path_norm": true,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qkv_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.22.0.dev0",
+    "typical_p": 1.0,
+    "use_absolute_embeddings": false,
+    "use_bfloat16": false,
+    "window_size": 10
+  },
+  "is_encoder_decoder": true,
+  "model_type": "vision-encoder-decoder",
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": null
+}

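The config wires a `donut-swin` encoder (embed dim 128, depths 2/2/14/2, 1280x960 input) to a 4-layer `mbart` decoder with a 57,580-token vocabulary. Note that the decoder's `max_position_embeddings` of 768 is the value `handler.py` below reuses as its generation `max_length`. A quick sketch to inspect these values, assuming `transformers` is installed:

```python
from transformers import VisionEncoderDecoderConfig

config = VisionEncoderDecoderConfig.from_pretrained("philschmid/donut-base-finetuned-cord-v2")
print(config.encoder.model_type)               # donut-swin
print(config.decoder.model_type)               # mbart
print(config.encoder.image_size)               # [1280, 960]
print(config.decoder.max_position_embeddings)  # 768, reused as max_length in handler.py
```
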
create_handler.ipynb
ADDED
@@ -0,0 +1,167 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install transformers --upgrade"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create Custom Handler for Inference Endpoints\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting handler.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile handler.py\n",
+    "from typing import Dict, List, Any\n",
+    "from transformers import DonutProcessor, VisionEncoderDecoderModel\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "# check for GPU\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "\n",
+    "class EndpointHandler:\n",
+    "    def __init__(self, path=\"\"):\n",
+    "        # load the model\n",
+    "        self.processor = DonutProcessor.from_pretrained(path)\n",
+    "        self.model = VisionEncoderDecoderModel.from_pretrained(path)\n",
+    "        # move model to device\n",
+    "        self.model.to(device)\n",
+    "        self.decoder_input_ids = self.processor.tokenizer(\n",
+    "            \"<s_cord-v2>\", add_special_tokens=False, return_tensors=\"pt\"\n",
+    "        ).input_ids\n",
+    "\n",
+    "    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:\n",
+    "\n",
+    "        inputs = data.pop(\"inputs\", data)\n",
+    "\n",
+    "\n",
+    "        # preprocess the input\n",
+    "        pixel_values = self.processor(inputs, return_tensors=\"pt\").pixel_values\n",
+    "\n",
+    "        # forward pass\n",
+    "        outputs = self.model.generate(\n",
+    "            pixel_values.to(device),\n",
+    "            decoder_input_ids=self.decoder_input_ids.to(device),\n",
+    "            max_length=self.model.decoder.config.max_position_embeddings,\n",
+    "            early_stopping=True,\n",
+    "            pad_token_id=self.processor.tokenizer.pad_token_id,\n",
+    "            eos_token_id=self.processor.tokenizer.eos_token_id,\n",
+    "            use_cache=True,\n",
+    "            num_beams=1,\n",
+    "            bad_words_ids=[[self.processor.tokenizer.unk_token_id]],\n",
+    "            return_dict_in_generate=True,\n",
+    "        )\n",
+    "        # process output\n",
+    "        prediction = self.processor.batch_decode(outputs.sequences)[0]\n",
+    "        prediction = self.processor.token2json(prediction)\n",
+    "\n",
+    "        return prediction\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "test custom pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from handler import EndpointHandler\n",
+    "\n",
+    "my_handler = EndpointHandler(\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'menu': [{'nm': '0571-1854 BLUS WANITA',\n",
+       "   'unitprice': '@120.000',\n",
+       "   'cnt': '1',\n",
+       "   'price': '120,000'},\n",
+       "  {'nm': '1002-0060 SHOPPING BAG', 'cnt': '1', 'price': '0'}],\n",
+       " 'total': {'total_price': '120,000',\n",
+       "  'changeprice': '0',\n",
+       "  'creditcardprice': '120,000',\n",
+       "  'menuqty_cnt': '1'}}"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from PIL import Image\n",
+    "\n",
+    "payload = {\"inputs\": Image.open(\"sample.png\").convert(\"RGB\")}\n",
+    "\n",
+    "my_handler(payload)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.13 ('dev': conda)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

handler.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Dict, List, Any
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+import torch
+
+
+# check for GPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # load the model
+        self.processor = DonutProcessor.from_pretrained(path)
+        self.model = VisionEncoderDecoderModel.from_pretrained(path)
+        # move model to device
+        self.model.to(device)
+        self.decoder_input_ids = self.processor.tokenizer(
+            "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
+        ).input_ids
+
+    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+
+        inputs = data.pop("inputs", data)
+
+
+        # preprocess the input
+        pixel_values = self.processor(inputs, return_tensors="pt").pixel_values
+
+        # forward pass
+        outputs = self.model.generate(
+            pixel_values.to(device),
+            decoder_input_ids=self.decoder_input_ids.to(device),
+            max_length=self.model.decoder.config.max_position_embeddings,
+            early_stopping=True,
+            pad_token_id=self.processor.tokenizer.pad_token_id,
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            use_cache=True,
+            num_beams=1,
+            bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
+            return_dict_in_generate=True,
+        )
+        # process output
+        prediction = self.processor.batch_decode(outputs.sequences)[0]
+        prediction = self.processor.token2json(prediction)
+
+        return prediction

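The handler expects an already-deserialized image under `"inputs"`; turning the raw request body into a PIL image is the serving layer's job. A rough local simulation of that step, assuming the endpoint toolkit decodes image content types to PIL images before calling the handler:

```python
from io import BytesIO
from PIL import Image
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # load from a local clone of this repo

# simulate what the serving layer does with the binary request body
with open("sample.png", "rb") as f:
    raw_body = f.read()
image = Image.open(BytesIO(raw_body)).convert("RGB")

print(handler({"inputs": image}))
```
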
preprocessor_config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "do_align_long_axis": false,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_resize": true,
+  "do_thumbnail": true,
+  "feature_extractor_type": "DonutFeatureExtractor",
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "DonutProcessor",
+  "resample": 2,
+  "size": [
+    960,
+    1280
+  ]
+}

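These settings tell `DonutProcessor` to thumbnail/resize each page toward 960x1280 (width x height), pad it, and normalize every channel with mean and std 0.5. A quick sketch of the resulting tensor, assuming a local `sample.png`:

```python
from PIL import Image
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained("philschmid/donut-base-finetuned-cord-v2")
image = Image.open("sample.png").convert("RGB")

pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # expected: torch.Size([1, 3, 1280, 960])
```
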
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31b78e3d3891072de8e2bf3553b71782242a1f3b589b914ec2b03feff7b14c54
+size 806248251

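Only the Git LFS pointer is committed here; the actual checkpoint is roughly 806 MB. One way to fetch the resolved file, assuming `huggingface_hub` is installed:

```python
from huggingface_hub import hf_hub_download

# downloads the ~806 MB weights rather than the 3-line LFS pointer
weights_path = hf_hub_download(
    repo_id="philschmid/donut-base-finetuned-cord-v2",
    filename="pytorch_model.bin",
)
print(weights_path)
```
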
res.png
ADDED
sample.png
ADDED
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
+size 1296245

special_tokens_map.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "additional_special_tokens": [
+    "<s_cord-v2>"
+  ],
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json
ADDED
The diff for this file is too large to render.

tokenizer_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "from_slow": true,
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "name_or_path": "naver-clova-ix/donut-base-finetuned-cord-v2",
+  "pad_token": "<pad>",
+  "processor_class": "DonutProcessor",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}