Spaces:
Runtime error
Runtime error
Add Japanese language support, switch to PP-OCRv4, and fix several padding-related bugs
Browse files- app.py +48 -13
- test_pdf2img.py +16 -0
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import string
|
|
| 3 |
import random
|
| 4 |
from collections import Counter
|
| 5 |
from itertools import count, tee
|
|
|
|
| 6 |
|
| 7 |
import cv2
|
| 8 |
import matplotlib.pyplot as plt
|
|
@@ -14,7 +15,7 @@ from PIL import Image
|
|
| 14 |
from transformers import DetrImageProcessor, TableTransformerForObjectDetection
|
| 15 |
from paddleocr import PaddleOCR
|
| 16 |
|
| 17 |
-
ocr = PaddleOCR(use_angle_cls=True, lang="en",use_gpu=False)
|
| 18 |
|
| 19 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 20 |
st.set_page_config(layout='wide')
|
|
@@ -28,6 +29,10 @@ table_detection_model = TableTransformerForObjectDetection.from_pretrained(
|
|
| 28 |
table_recognition_model = TableTransformerForObjectDetection.from_pretrained(
|
| 29 |
"microsoft/table-transformer-structure-recognition")
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def PIL_to_cv(pil_img):
    """Return *pil_img* as an array converted with ``cv2.COLOR_RGB2BGR``."""
    # Materialise the image as an array first, then let OpenCV do the
    # RGB -> BGR conversion.
    rgb_pixels = np.array(pil_img)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
|
|
@@ -201,6 +206,32 @@ class TableExtractionPipeline():
|
|
| 201 |
result.paste(pil_img, (left, top))
|
| 202 |
return result
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
def plot_results_detection(self, c1, model, pil_img, prob, boxes,
|
| 205 |
delta_xmin, delta_ymin, delta_xmax, delta_ymax):
|
| 206 |
'''
|
|
@@ -213,7 +244,7 @@ class TableExtractionPipeline():
|
|
| 213 |
|
| 214 |
for p, (xmin, ymin, xmax, ymax) in zip(prob, boxes.tolist()):
|
| 215 |
cl = p.argmax()
|
| 216 |
-
xmin, ymin, xmax, ymax = xmin
|
| 217 |
ax.add_patch(
|
| 218 |
plt.Rectangle((xmin, ymin),
|
| 219 |
xmax - xmin,
|
|
@@ -238,8 +269,7 @@ class TableExtractionPipeline():
|
|
| 238 |
cropped_img_list = []
|
| 239 |
|
| 240 |
for p, (xmin, ymin, xmax, ymax) in zip(prob, boxes.tolist()):
|
| 241 |
-
|
| 242 |
-
xmin, ymin, xmax, ymax = xmin - delta_xmin, ymin - delta_ymin, xmax + delta_xmax, ymax + delta_ymax
|
| 243 |
cropped_img = pil_img.crop((xmin, ymin, xmax, ymax))
|
| 244 |
cropped_img_list.append(cropped_img)
|
| 245 |
|
|
@@ -412,7 +442,8 @@ class TableExtractionPipeline():
|
|
| 412 |
|
| 413 |
@st.cache
|
| 414 |
def convert_df(self, df):
|
| 415 |
-
|
|
|
|
| 416 |
|
| 417 |
def create_dataframe(self, c3, cell_ocr_res: list, max_cols: int,
|
| 418 |
max_rows: int):
|
|
@@ -456,15 +487,15 @@ class TableExtractionPipeline():
|
|
| 456 |
csv = self.convert_df(df)
|
| 457 |
|
| 458 |
try:
|
| 459 |
-
numkey = df.iloc[0, 0]
|
| 460 |
-
except:
|
| 461 |
numkey = str(0)
|
| 462 |
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
|
| 469 |
return df
|
| 470 |
|
|
@@ -548,7 +579,11 @@ class TableExtractionPipeline():
|
|
| 548 |
|
| 549 |
if __name__ == "__main__":
|
| 550 |
|
| 551 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
st1, st2, st3 = st.columns((1, 1, 1))
|
| 553 |
TD_th = st1.slider('Table detection threshold', 0.0, 1.0, 0.8)
|
| 554 |
TSR_th = st2.slider('Table structure recognition threshold', 0.0, 1.0, 0.7)
|
|
|
|
| 3 |
import random
|
| 4 |
from collections import Counter
|
| 5 |
from itertools import count, tee
|
| 6 |
+
import base64
|
| 7 |
|
| 8 |
import cv2
|
| 9 |
import matplotlib.pyplot as plt
|
|
|
|
| 15 |
from transformers import DetrImageProcessor, TableTransformerForObjectDetection
|
| 16 |
from paddleocr import PaddleOCR
|
| 17 |
|
| 18 |
+
ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=False, ocr_version='PP-OCRv4')
|
| 19 |
|
| 20 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 21 |
st.set_page_config(layout='wide')
|
|
|
|
| 29 |
table_recognition_model = TableTransformerForObjectDetection.from_pretrained(
|
| 30 |
"microsoft/table-transformer-structure-recognition")
|
| 31 |
|
| 32 |
+
def reload_ocr(vlang):
    """Rebuild the module-level ``ocr`` engine for the language code *vlang*.

    The freshly built ``PaddleOCR`` instance replaces whatever the global
    ``ocr`` name referred to before. Only the language varies; angle
    classification stays on, GPU stays off and the model version is
    ``PP-OCRv4``.
    """
    global ocr
    engine_options = {
        "use_angle_cls": True,
        "lang": vlang,
        "use_gpu": False,
        "ocr_version": 'PP-OCRv4',
    }
    ocr = PaddleOCR(**engine_options)
|
| 35 |
+
|
| 36 |
|
| 37 |
def PIL_to_cv(pil_img):
    """Return *pil_img* as an array converted with ``cv2.COLOR_RGB2BGR``."""
    # Materialise the image as an array first, then let OpenCV do the
    # RGB -> BGR conversion.
    rgb_pixels = np.array(pil_img)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
|
|
|
|
| 206 |
result.paste(pil_img, (left, top))
|
| 207 |
return result
|
| 208 |
|
| 209 |
+
@staticmethod
def dynamic_delta(xmin, ymin, xmax, ymax, delta_xmin, delta_ymin, delta_xmax, delta_ymax, pil_img, *, offset_ratio=0.05):
    """Pad a bounding box and clamp the result to the image area.

    Each side of the box is pushed outwards by its fixed ``delta_*`` amount
    plus a margin proportional to the box size (``offset_ratio`` of the box
    width for the left/right sides, of the box height for the top/bottom
    sides). The padded coordinates are then limited to the image.

    Args:
        xmin, ymin, xmax, ymax: Box corners in image coordinates.
        delta_xmin, delta_ymin, delta_xmax, delta_ymax: Fixed padding added
            on the left, top, right and bottom side respectively.
        pil_img: Object whose ``size`` attribute is ``(width, height)``.
        offset_ratio: Fraction of the box width/height added as extra
            padding on every side. Defaults to 0.05.

    Returns:
        Tuple ``(xmin, ymin, xmax, ymax)`` of the padded box, with every
        coordinate inside ``[0, width]`` / ``[0, height]``.
    """
    width, height = pil_img.size
    pad_x = (xmax - xmin) * offset_ratio
    pad_y = (ymax - ymin) * offset_ratio

    def clamp(value, upper):
        # Limit on BOTH ends: a box lying (partly) outside the image must
        # not produce a left edge beyond the right border or a right edge
        # below zero, which would give an inverted crop rectangle.
        return min(max(value, 0), upper)

    return (
        clamp(xmin - (delta_xmin + pad_x), width),
        clamp(ymin - (delta_ymin + pad_y), height),
        clamp(xmax + (delta_xmax + pad_x), width),
        clamp(ymax + (delta_ymax + pad_y), height),
    )
|
| 234 |
+
|
| 235 |
def plot_results_detection(self, c1, model, pil_img, prob, boxes,
|
| 236 |
delta_xmin, delta_ymin, delta_xmax, delta_ymax):
|
| 237 |
'''
|
|
|
|
| 244 |
|
| 245 |
for p, (xmin, ymin, xmax, ymax) in zip(prob, boxes.tolist()):
|
| 246 |
cl = p.argmax()
|
| 247 |
+
xmin, ymin, xmax, ymax = self.dynamic_delta(xmin, ymin, xmax, ymax, delta_xmin, delta_ymin, delta_xmax, delta_ymax, pil_img)
|
| 248 |
ax.add_patch(
|
| 249 |
plt.Rectangle((xmin, ymin),
|
| 250 |
xmax - xmin,
|
|
|
|
| 269 |
cropped_img_list = []
|
| 270 |
|
| 271 |
for p, (xmin, ymin, xmax, ymax) in zip(prob, boxes.tolist()):
|
| 272 |
+
xmin, ymin, xmax, ymax = self.dynamic_delta(xmin, ymin, xmax, ymax, delta_xmin, delta_ymin, delta_xmax, delta_ymax, pil_img)
|
|
|
|
| 273 |
cropped_img = pil_img.crop((xmin, ymin, xmax, ymax))
|
| 274 |
cropped_img_list.append(cropped_img)
|
| 275 |
|
|
|
|
| 442 |
|
| 443 |
@st.cache
def convert_df(self, df):
    """Serialize *df* to CSV bytes suitable for a download link.

    Args:
        df: Object providing ``to_csv`` (a pandas DataFrame in this app).

    Returns:
        The CSV text (no index column) encoded as UTF-8 with a leading
        byte-order mark.
    """
    # ``to_csv`` called without a target returns a ``str``; passing
    # ``encoding`` there does not put a BOM into that string. The BOM that
    # lets Excel detect UTF-8 therefore has to come from the byte-encoding
    # step, so encode with 'utf-8-sig' rather than plain 'utf-8'.
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8-sig')
|
| 447 |
|
| 448 |
def create_dataframe(self, c3, cell_ocr_res: list, max_cols: int,
|
| 449 |
max_rows: int):
|
|
|
|
| 487 |
csv = self.convert_df(df)
|
| 488 |
|
| 489 |
try:
|
| 490 |
+
numkey = str(df.iloc[0, 0])
|
| 491 |
+
except IndexError:
|
| 492 |
numkey = str(0)
|
| 493 |
|
| 494 |
+
# Create a download link with filename and extension
|
| 495 |
+
filename = f"table_{numkey}.csv" # Adjust the filename as needed
|
| 496 |
+
b64_csv = base64.b64encode(csv).decode() # Encode CSV data to base64
|
| 497 |
+
href = f'<a href="data:file/csv;base64,{b64_csv}" download="{filename}">Download {filename}</a>'
|
| 498 |
+
c3.markdown(href, unsafe_allow_html=True)
|
| 499 |
|
| 500 |
return df
|
| 501 |
|
|
|
|
| 579 |
|
| 580 |
if __name__ == "__main__":
|
| 581 |
|
| 582 |
+
st_up, st_lang = st.columns((1, 1))
|
| 583 |
+
img_name = st_up.file_uploader("Upload an image with table(s)")
|
| 584 |
+
lang = st_lang.selectbox('Language', ('en', 'japan'))
|
| 585 |
+
reload_ocr(lang)
|
| 586 |
+
|
| 587 |
st1, st2, st3 = st.columns((1, 1, 1))
|
| 588 |
TD_th = st1.slider('Table detection threshold', 0.0, 1.0, 0.8)
|
| 589 |
TSR_th = st2.slider('Table structure recognition threshold', 0.0, 1.0, 0.7)
|
test_pdf2img.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Convert a page range of a PDF to JPEG files using pdf2image.
import os
from pdf2image import convert_from_path

# Set the PDF file path
pdf_path = 'test.pdf'

# Page range to convert, passed to pdf2image as first_page / last_page
first = 14
last = 14

# Poppler location. The POPPLER_PATH environment variable takes precedence;
# otherwise the original Windows install directory is used when it exists.
# If neither is available, None is passed so pdf2image looks Poppler up
# itself instead of failing on a path that only exists on one machine.
default_poppler_path = r"C:\poppler-23.07.0\Library\bin"
poppler_path = os.environ.get("POPPLER_PATH") or None
if poppler_path is None and os.path.isdir(default_poppler_path):
    poppler_path = default_poppler_path

images = convert_from_path(pdf_path, dpi=300, first_page=first, last_page=last, poppler_path=poppler_path)

# Save each page as "<pdf name without extension>p<page number>.jpg"
image_path = os.path.splitext(pdf_path)[0]

for page_number, image in enumerate(images, start=first):
    image.save(image_path + "p" + str(page_number) + '.jpg', 'JPEG')
|