aslessor, philschmid committed on
Commit 5476d32 · 0 Parent(s)

Duplicate from philschmid/donut-base-finetuned-cord-v2

Co-authored-by: Philipp Schmid <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,32 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,80 @@
+ ---
+ license: mit
+ tags:
+ - donut
+ - image-to-text
+ - vision
+ - endpoints-template
+ ---
+
+ # Fork of [naver-clova-ix/donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2)
+
+ > This is a fork of [naver-clova-ix/donut-base-finetuned-cord-v2](https://huggingface.co/naver-clova-ix/donut-base-finetuned-cord-v2) implementing a custom `handler.py` as an example of how to use `donut` models with [inference-endpoints](https://hf.co/inference-endpoints).
+
+ ---
+
+ # Donut (base-sized model, fine-tuned on CORD)
+
+ Donut model fine-tuned on CORD. It was introduced in the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim et al. and first released in [this repository](https://github.com/clovaai/donut).
+
+ Donut consists of a vision encoder (Swin Transformer) and a text decoder (BART). Given an image, the encoder first encodes it into a tensor of embeddings of shape `(batch_size, seq_len, hidden_size)`, after which the decoder autoregressively generates text conditioned on that encoding.
+
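+ A minimal local sketch of that flow (assuming a recent `transformers` with Donut support and `Pillow` are installed; it mirrors the `handler.py` in this repo):
+
+ ```python
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
+ from PIL import Image
+
+ # load the processor and model from this repository
+ processor = DonutProcessor.from_pretrained("philschmid/donut-base-finetuned-cord-v2")
+ model = VisionEncoderDecoderModel.from_pretrained("philschmid/donut-base-finetuned-cord-v2")
+
+ # encode the document image into pixel values for the Swin encoder
+ image = Image.open("sample.png").convert("RGB")
+ pixel_values = processor(image, return_tensors="pt").pixel_values
+
+ # the task token <s_cord-v2> starts decoding of the CORD receipt schema
+ prompt_ids = processor.tokenizer(
+     "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
+ ).input_ids
+ sequences = model.generate(
+     pixel_values,
+     decoder_input_ids=prompt_ids,
+     max_length=model.decoder.config.max_position_embeddings,
+ )
+ print(processor.token2json(processor.batch_decode(sequences)[0]))
+ ```
+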
+ # Use with Inference Endpoints
+
+ Hugging Face Inference Endpoints can work with binary data directly, which means we can send the image of our document straight to the endpoint. We will use `requests` to send the request (make sure you have it installed: `pip install requests`).
+
+ ![result](res.png)
+
+ ## Send requests with Python
+
+ Load the sample image:
+
+ ```bash
+ wget https://huggingface.co/philschmid/donut-base-finetuned-cord-v2/resolve/main/sample.png
+ ```
+
+ Send a request to the endpoint:
+
+ ```python
+ import requests as r
+ import mimetypes
+
+ ENDPOINT_URL = ""  # url of your endpoint
+ HF_TOKEN = ""  # token of the organization where you deployed your endpoint
+
+ def predict(path_to_image: str = None):
+     with open(path_to_image, "rb") as i:
+         b = i.read()
+     headers = {
+         "Authorization": f"Bearer {HF_TOKEN}",
+         "Content-Type": mimetypes.guess_type(path_to_image)[0],
+     }
+     response = r.post(ENDPOINT_URL, headers=headers, data=b)
+     return response.json()
+
+ prediction = predict(path_to_image="sample.png")
+
+ print(prediction)
+ # {'menu': [{'nm': '0571-1854 BLUS WANITA',
+ #    'unitprice': '@120.000',
+ #    'cnt': '1',
+ #    'price': '120,000'},
+ #   {'nm': '1002-0060 SHOPPING BAG', 'cnt': '1', 'price': '0'}],
+ #  'total': {'total_price': '120,000',
+ #    'changeprice': '0',
+ #    'creditcardprice': '120,000',
+ #    'menuqty_cnt': '1'}}
+ ```
+
+ **curl example**
+
+ ```bash
+ curl https://ak7gduay2ypyr9vp.us-east-1.aws.endpoints.huggingface.cloud \
+   -X POST \
+   --data-binary '@sample.png' \
+   -H "Authorization: Bearer XXX" \
+   -H "Content-Type: image/png"
+ ```
added_tokens.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "</s_cashprice>": 57549,
+   "</s_changeprice>": 57551,
+   "</s_cnt>": 57529,
+   "</s_creditcardprice>": 57563,
+   "</s_discount_price>": 57557,
+   "</s_discountprice>": 57567,
+   "</s_emoneyprice>": 57569,
+   "</s_etc>": 57541,
+   "</s_itemsubtotal>": 57577,
+   "</s_menu>": 57525,
+   "</s_menuqty_cnt>": 57555,
+   "</s_menutype_cnt>": 57553,
+   "</s_nm>": 57527,
+   "</s_num>": 57565,
+   "</s_othersvc_price>": 57573,
+   "</s_price>": 57531,
+   "</s_service_price>": 57537,
+   "</s_sub>": 57547,
+   "</s_sub_total>": 57533,
+   "</s_subtotal_price>": 57535,
+   "</s_tax_price>": 57539,
+   "</s_total>": 57543,
+   "</s_total_etc>": 57561,
+   "</s_total_price>": 57545,
+   "</s_unitprice>": 57559,
+   "</s_vatyn>": 57575,
+   "</s_void_menu>": 57571,
+   "<s_cashprice>": 57550,
+   "<s_changeprice>": 57552,
+   "<s_cnt>": 57530,
+   "<s_cord-v2>": 57579,
+   "<s_creditcardprice>": 57564,
+   "<s_discount_price>": 57558,
+   "<s_discountprice>": 57568,
+   "<s_emoneyprice>": 57570,
+   "<s_etc>": 57542,
+   "<s_iitcdip>": 57523,
+   "<s_itemsubtotal>": 57578,
+   "<s_menu>": 57526,
+   "<s_menuqty_cnt>": 57556,
+   "<s_menutype_cnt>": 57554,
+   "<s_nm>": 57528,
+   "<s_num>": 57566,
+   "<s_othersvc_price>": 57574,
+   "<s_price>": 57532,
+   "<s_service_price>": 57538,
+   "<s_sub>": 57548,
+   "<s_sub_total>": 57534,
+   "<s_subtotal_price>": 57536,
+   "<s_synthdog>": 57524,
+   "<s_tax_price>": 57540,
+   "<s_total>": 57544,
+   "<s_total_etc>": 57562,
+   "<s_total_price>": 57546,
+   "<s_unitprice>": 57560,
+   "<s_vatyn>": 57576,
+   "<s_void_menu>": 57572,
+   "<sep/>": 57522
+ }
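
These added tokens define the CORD-v2 output schema: each field in the JSON prediction is delimited by a `<s_...>`/`</s_...>` pair, and `<s_cord-v2>` (id 57579) is the task prompt the handler feeds the decoder. A quick sketch to verify the mapping (assuming the repo files are checked out in the current directory):

```python
from transformers import DonutProcessor

# the processor bundles the tokenizer that carries these added tokens
processor = DonutProcessor.from_pretrained(".")

# the task-start token should resolve to the id listed in added_tokens.json
print(processor.tokenizer.convert_tokens_to_ids("<s_cord-v2>"))  # expected: 57579
```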
config.json ADDED
@@ -0,0 +1,187 @@
+ {
+   "architectures": [
+     "VisionEncoderDecoderModel"
+   ],
+   "decoder": {
+     "_name_or_path": "",
+     "activation_dropout": 0.0,
+     "activation_function": "gelu",
+     "add_cross_attention": true,
+     "add_final_layer_norm": true,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "classifier_dropout": 0.0,
+     "cross_attention_hidden_size": null,
+     "d_model": 1024,
+     "decoder_attention_heads": 16,
+     "decoder_ffn_dim": 4096,
+     "decoder_layerdrop": 0.0,
+     "decoder_layers": 4,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.1,
+     "early_stopping": false,
+     "encoder_attention_heads": 16,
+     "encoder_ffn_dim": 4096,
+     "encoder_layerdrop": 0.0,
+     "encoder_layers": 12,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": 2,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "init_std": 0.02,
+     "is_decoder": true,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 768,
+     "min_length": 0,
+     "model_type": "mbart",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_embedding": true,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.22.0.dev0",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 57580
+   },
+   "encoder": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_probs_dropout_prob": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "depths": [
+       2,
+       2,
+       14,
+       2
+     ],
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "drop_path_rate": 0.1,
+     "early_stopping": false,
+     "embed_dim": 128,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": [
+       1280,
+       960
+     ],
+     "initializer_range": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "mlp_ratio": 4.0,
+     "model_type": "donut-swin",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_heads": [
+       4,
+       8,
+       16,
+       32
+     ],
+     "num_layers": 4,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 4,
+     "path_norm": true,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "qkv_bias": true,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.22.0.dev0",
+     "typical_p": 1.0,
+     "use_absolute_embeddings": false,
+     "use_bfloat16": false,
+     "window_size": 10
+   },
+   "is_encoder_decoder": true,
+   "model_type": "vision-encoder-decoder",
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": null
+ }
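
The handler below reads its generation length cap straight from this file: `max_length=self.model.decoder.config.max_position_embeddings` resolves to the decoder's `max_position_embeddings` of 768. A small sketch of inspecting these nested values (assuming the repo files are checked out in the current directory):

```python
from transformers import VisionEncoderDecoderConfig

# load the nested encoder/decoder config from config.json in this repo
config = VisionEncoderDecoderConfig.from_pretrained(".")

print(config.encoder.image_size)               # [1280, 960] -> Swin input resolution
print(config.decoder.max_position_embeddings)  # 768 -> generation cap used by handler.py
```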
create_handler.ipynb ADDED
@@ -0,0 +1,167 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "!pip install transformers --upgrade"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "## Create Custom Handler for Inference Endpoints\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 17,
+       "metadata": {},
+       "outputs": [
+         {
+           "name": "stdout",
+           "output_type": "stream",
+           "text": [
+             "Overwriting handler.py\n"
+           ]
+         }
+       ],
+       "source": [
+         "%%writefile handler.py\n",
+         "from typing import Dict, List, Any\n",
+         "from transformers import DonutProcessor, VisionEncoderDecoderModel\n",
+         "import torch\n",
+         "\n",
+         "\n",
+         "# check for GPU\n",
+         "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+         "\n",
+         "\n",
+         "class EndpointHandler:\n",
+         "    def __init__(self, path=\"\"):\n",
+         "        # load the model\n",
+         "        self.processor = DonutProcessor.from_pretrained(path)\n",
+         "        self.model = VisionEncoderDecoderModel.from_pretrained(path)\n",
+         "        # move model to device\n",
+         "        self.model.to(device)\n",
+         "        # task prompt that starts CORD-v2 decoding\n",
+         "        self.decoder_input_ids = self.processor.tokenizer(\n",
+         "            \"<s_cord-v2>\", add_special_tokens=False, return_tensors=\"pt\"\n",
+         "        ).input_ids\n",
+         "\n",
+         "    def __call__(self, data: Any) -> Dict[str, Any]:\n",
+         "        inputs = data.pop(\"inputs\", data)\n",
+         "\n",
+         "        # preprocess the input\n",
+         "        pixel_values = self.processor(inputs, return_tensors=\"pt\").pixel_values\n",
+         "\n",
+         "        # forward pass\n",
+         "        outputs = self.model.generate(\n",
+         "            pixel_values.to(device),\n",
+         "            decoder_input_ids=self.decoder_input_ids.to(device),\n",
+         "            max_length=self.model.decoder.config.max_position_embeddings,\n",
+         "            early_stopping=True,\n",
+         "            pad_token_id=self.processor.tokenizer.pad_token_id,\n",
+         "            eos_token_id=self.processor.tokenizer.eos_token_id,\n",
+         "            use_cache=True,\n",
+         "            num_beams=1,\n",
+         "            bad_words_ids=[[self.processor.tokenizer.unk_token_id]],\n",
+         "            return_dict_in_generate=True,\n",
+         "        )\n",
+         "        # process output\n",
+         "        prediction = self.processor.batch_decode(outputs.sequences)[0]\n",
+         "        prediction = self.processor.token2json(prediction)\n",
+         "\n",
+         "        return prediction\n"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "test custom pipeline"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 2,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from handler import EndpointHandler\n",
+         "\n",
+         "my_handler = EndpointHandler(\".\")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 18,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "{'menu': [{'nm': '0571-1854 BLUS WANITA',\n",
+               "   'unitprice': '@120.000',\n",
+               "   'cnt': '1',\n",
+               "   'price': '120,000'},\n",
+               "  {'nm': '1002-0060 SHOPPING BAG', 'cnt': '1', 'price': '0'}],\n",
+               " 'total': {'total_price': '120,000',\n",
+               "   'changeprice': '0',\n",
+               "   'creditcardprice': '120,000',\n",
+               "   'menuqty_cnt': '1'}}"
+             ]
+           },
+           "execution_count": 18,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "from PIL import Image\n",
+         "\n",
+         "payload = {\"inputs\": Image.open(\"sample.png\").convert(\"RGB\")}\n",
+         "\n",
+         "my_handler(payload)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": []
+     }
+   ],
+   "metadata": {
+     "kernelspec": {
+       "display_name": "Python 3.9.13 ('dev': conda)",
+       "language": "python",
+       "name": "python3"
+     },
+     "language_info": {
+       "codemirror_mode": {
+         "name": "ipython",
+         "version": 3
+       },
+       "file_extension": ".py",
+       "mimetype": "text/x-python",
+       "name": "python",
+       "nbconvert_exporter": "python",
+       "pygments_lexer": "ipython3",
+       "version": "3.9.13"
+     },
+     "orig_nbformat": 4,
+     "vscode": {
+       "interpreter": {
+         "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
+       }
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 2
+ }
handler.py ADDED
@@ -0,0 +1,46 @@
+ from typing import Dict, List, Any
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
+ import torch
+
+
+ # check for GPU
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load the model
+         self.processor = DonutProcessor.from_pretrained(path)
+         self.model = VisionEncoderDecoderModel.from_pretrained(path)
+         # move model to device
+         self.model.to(device)
+         # task prompt that starts CORD-v2 decoding
+         self.decoder_input_ids = self.processor.tokenizer(
+             "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
+         ).input_ids
+
+     def __call__(self, data: Any) -> Dict[str, Any]:
+         inputs = data.pop("inputs", data)
+
+         # preprocess the input
+         pixel_values = self.processor(inputs, return_tensors="pt").pixel_values
+
+         # forward pass
+         outputs = self.model.generate(
+             pixel_values.to(device),
+             decoder_input_ids=self.decoder_input_ids.to(device),
+             max_length=self.model.decoder.config.max_position_embeddings,
+             early_stopping=True,
+             pad_token_id=self.processor.tokenizer.pad_token_id,
+             eos_token_id=self.processor.tokenizer.eos_token_id,
+             use_cache=True,
+             num_beams=1,
+             bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
+             return_dict_in_generate=True,
+         )
+         # process output
+         prediction = self.processor.batch_decode(outputs.sequences)[0]
+         prediction = self.processor.token2json(prediction)
+
+         return prediction
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "do_align_long_axis": false,
+   "do_normalize": true,
+   "do_pad": true,
+   "do_resize": true,
+   "do_thumbnail": true,
+   "feature_extractor_type": "DonutFeatureExtractor",
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "DonutProcessor",
+   "resample": 2,
+   "size": [
+     960,
+     1280
+   ]
+ }
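
These settings drive `DonutProcessor`'s image preprocessing: resize, thumbnail, and pad to the fixed `size` of `[960, 1280]`, then normalize with mean/std 0.5. A minimal sketch to see the resulting tensor shape (assuming the repo files are checked out in the current directory and `Pillow` is installed; the dummy image is illustrative):

```python
from PIL import Image
from transformers import DonutProcessor

processor = DonutProcessor.from_pretrained(".")

# any RGB image is resized, padded, and normalized per preprocessor_config.json
image = Image.new("RGB", (640, 480), "white")  # dummy input
pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # expected: torch.Size([1, 3, 1280, 960]), matching the encoder's image_size
```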
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31b78e3d3891072de8e2bf3553b71782242a1f3b589b914ec2b03feff7b14c54
+ size 806248251
res.png ADDED
sample.png ADDED
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
+ size 1296245
special_tokens_map.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "additional_special_tokens": [
+     "<s_cord-v2>"
+   ],
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "from_slow": true,
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "name_or_path": "naver-clova-ix/donut-base-finetuned-cord-v2",
+   "pad_token": "<pad>",
+   "processor_class": "DonutProcessor",
+   "sep_token": "</s>",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": null,
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }