kalle07 committed
Commit c839177 · verified · 1 Parent(s): ccefb19

Upload 3 files
.gitattributes CHANGED
@@ -38,3 +38,4 @@ parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
  parser_sevenof9_v1_1_en.exe filter=lfs diff=lfs merge=lfs -text
  parser_sevenof9_v1_2.exe filter=lfs diff=lfs merge=lfs -text
  PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7d.exe filter=lfs diff=lfs merge=lfs -text
+ PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7e.exe filter=lfs diff=lfs merge=lfs -text
PDF Parser - Sevenof9_v7e.exe ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5c1c7049be77caef46372747771e362ee75605ae68676683286f061faa40b01
+ size 40700203
PDF Parser - Sevenof9_v7e.py ADDED
@@ -0,0 +1,1133 @@
import os
import sys
import time
import json
import wx
import re
import platform
import subprocess
import threading
import concurrent.futures
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed
import pdfplumber
import psutil
import logging
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from rtree import index
import numpy as np
from typing import Any, Dict, Iterable, List, Sequence, Tuple, ClassVar
from dataclasses import dataclass, field, replace
import math

# --------------------------------------------------------------
# 1. Configuration & compiled regexes
# --------------------------------------------------------------
PARALLEL_THRESHOLD = 16


@dataclass(frozen=True)
class Config:
    PARALLEL_THRESHOLD: int = 16  # pages per file before we switch to parallel mode

    # Class-level constant - accessible via Config.TEXT_EXTRACT_SETTINGS
    TEXT_EXTRACT_SETTINGS: ClassVar[Dict[str, Any]] = {
        "x_tolerance": 1.5,
        "y_tolerance": 2.5,
        "keep_blank_chars": False,
        "use_text_flow": False,
    }

    LEFT_RIGHT_MARGIN_PCT: float = 5.3
    TOP_BOTTOM_MARGIN_PCT: float = 6.0


# Regexes - compiled once for speed
CID_PATTERN = re.compile(r"\(cid:\d+\)")            # CID artifacts such as "(cid:79)"
NON_PRINTABLE_RE = re.compile(r"[^\u0000-\uFFFF]")  # characters outside the Basic Multilingual Plane

def clean_cell_text(text):
    if not isinstance(text, str):
        return ""
    # Remove CID artifacts like (cid:79), (cid:111), etc.
    text = CID_PATTERN.sub("", text)
    # Remove other non-printable characters
    text = NON_PRINTABLE_RE.sub("", text)
    return text.strip()


# --------------------------------------------------------------
# 2. Small utilities
# --------------------------------------------------------------

def get_physical_cores():
    count = psutil.cpu_count(logical=False)
    return max(1, count if count else 1)  # fallback = 1

cores = get_physical_cores()

# GUI update interval
def throttle_callback(callback, interval_ms=1):
    last_called = 0

    def wrapper(status):
        nonlocal last_called
        now = time.time() * 1000  # time in ms
        if now - last_called >= interval_ms:
            last_called = now
            callback(status)
    return wrapper


def clamp_bbox(bbox: Tuple[float, float, float, float], w: float, h: float) -> Tuple[int, int, int, int]:
    """Clamp a bbox to the page dimensions and round to the nearest integer."""
    x0, top, x1, bottom = bbox
    return (
        round(max(0, min(x0, w))),
        round(max(0, min(top, h))),
        round(min(x1, w)),
        round(min(bottom, h)),
    )


def is_valid_cell(cell: Any) -> bool:
    """Return True if a cell contains something meaningful."""
    return bool(str(cell).strip() and len(str(cell).strip()) > 1)


# Function to suppress PDFMiner logging, reducing verbosity
def suppress_pdfminer_logging():
    for logger_name in [
        "pdfminer",  # various pdfminer modules to suppress logging from
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils"
    ]:
        logging.getLogger(logger_name).setLevel(logging.ERROR)  # ERROR suppresses all lower levels

suppress_pdfminer_logging()

class StatusTracker:
    def __init__(self, total_pages):
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        self.processed_pages += n

    def get_status(self):
        elapsed = time.time() - self.start_time
        pages_per_sec = round(self.processed_pages / elapsed) if elapsed > 0 else 0
        remaining_pages = self.total_pages - self.processed_pages
        est_time = (remaining_pages / pages_per_sec) / 60 if pages_per_sec > 0 else float('inf')
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": pages_per_sec,
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1)
        }

# --------------------------------------------------------------
# 3. Data models
# --------------------------------------------------------------

@dataclass(frozen=True)
class Word:
    text: str
    x0: float
    y0: float
    x1: float
    y1: float
    font_size: float
    font_name: str
    bold: bool


@dataclass
class Block:
    words: List[Word] = field(default_factory=list)

    def bbox(self) -> Tuple[float, float, float, float]:
        if not self.words:
            return 0.0, 0.0, 0.0, 0.0
        x0 = min(w.x0 for w in self.words)
        y0 = min(w.y0 for w in self.words)
        x1 = max(w.x1 for w in self.words)
        y1 = max(w.y1 for w in self.words)
        return (x0, y0, x1, y1)


@dataclass
class ImageInfo:
    bbox: Tuple[float, float, float, float]
    obj: Any  # raw image dictionary from pdfplumber


# --------------------------------------------------------------
# 4. Union-Find clustering
# --------------------------------------------------------------

class _UnionFind:
    def __init__(self, n: int):
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, a: int, b: int) -> None:
        ra, rb = self.find(a), self.find(b)
        if ra == rb:
            return
        if self.rank[ra] < self.rank[rb]:
            ra, rb = rb, ra
        self.parent[rb] = ra
        if self.rank[ra] == self.rank[rb]:
            self.rank[ra] += 1


def cluster_words(words: Sequence[Word], max_dx: int, max_dy: int) -> List[Block]:
    """Group words into blocks based on proximity using optimized neighbor search."""
    n = len(words)
    if n == 0:
        return []

    uf = _UnionFind(n)

    def is_neighbor(word1: Word, word2: Word) -> bool:
        # Horizontal and vertical gap between the two word boxes (0 if they overlap)
        dx = max(0.0, max(word1.x0 - word2.x1, word2.x0 - word1.x1))
        dy = max(0.0, max(word1.y0 - word2.y1, word2.y0 - word1.y1))
        return dx <= max_dx and dy <= max_dy

    # Track which words have already been processed (4 neighbors found)
    processed = [False] * n

    for i in range(n):
        if processed[i]:
            continue

        neighbor_count = 0
        neighbors_found = []

        # Check against all other words - the key optimization is to stop early
        for j in range(n):
            if i == j:
                continue

            word1, word2 = words[i], words[j]

            if is_neighbor(word1, word2):
                neighbors_found.append(j)
                neighbor_count += 1

                # Early stopping:
                # 1. With at least 2 neighbors the word belongs to a text block
                # 2. With 4 neighbors (the maximum possible in 2D) stop processing this word
                if neighbor_count >= 2:
                    # Union with all neighbors found so far
                    for k in neighbors_found:
                        uf.union(i, k)

                # Second early stop - no need to check further once 4 neighbors are found
                if neighbor_count >= 4:
                    processed[i] = True
                    break

        # Continue with the next word even if the current one had < 2 neighbors

    # Build clusters
    clusters: Dict[int, List[Word]] = {}
    for idx in range(n):
        root = uf.find(idx)
        clusters.setdefault(root, []).append(words[idx])

    # Return as list of Blocks
    return [Block(wlist) for wlist in clusters.values()]


# --------------------------------------------------------------
# 5. Character index (vectorised)
# --------------------------------------------------------------

@dataclass
class CharIndex:
    xs0: np.ndarray
    xs1: np.ndarray
    tops: np.ndarray
    bottoms: np.ndarray
    texts: List[str]
    fonts: List[str]
    sizes: np.ndarray

    @classmethod
    def build(cls, chars: Sequence[Dict[str, Any]]) -> "CharIndex":
        return cls(
            xs0=np.array([float(c["x0"]) for c in chars]),
            xs1=np.array([float(c["x1"]) for c in chars]),
            tops=np.array([float(c["top"]) for c in chars]),
            bottoms=np.array([float(c["bottom"]) for c in chars]),
            texts=[c.get("text", "") for c in chars],
            fonts=[c.get("fontname", "") for c in chars],
            sizes=np.array([float(c.get("size", 0)) for c in chars]),
        )

    def inside(self, x0: float, x1: float, y0: float, y1: float) -> np.ndarray:
        return (
            (self.xs0 >= x0)
            & (self.xs1 <= x1)
            & (self.tops >= y0)
            & (self.bottoms <= y1)
        )


# --------------------------------------------------------------
# 6. Core extraction helpers
# --------------------------------------------------------------

def _extract_tables(page: pdfplumber.page.Page) -> List[Tuple[str, Any]]:
    """Return a list of JSON strings representing tables."""
    suppress_pdfminer_logging()
    raw_tables = page.extract_tables({"text_x_tolerance": Config.TEXT_EXTRACT_SETTINGS["x_tolerance"]})
    jsons = []

    for tbl in raw_tables:
        if not tbl or len(tbl) < 2:  # ignore tiny tables
            continue

        # filter out empty tables
        if all(all(not is_valid_cell(cell) for cell in row if row) for row in tbl):
            continue

        cleaned = [[clean_cell_text(c) for c in row] for row in tbl]
        header = cleaned[0]

        if header[0].strip() == "":
            # corner-empty table: first column holds the row headers
            col_headers = header[1:]
            row_headers = [row[0] for row in cleaned[1:]]
            data_rows = cleaned[1:]

            table_dict = {}
            for rh, row in zip(row_headers, data_rows):
                table_dict[rh] = dict(zip(col_headers, row[1:]))
        else:
            # normal header table
            headers = header
            data_rows = cleaned[1:]
            table_dict = [dict(zip(headers, row)) for row in data_rows if len(row) == len(headers)]

        jsons.append(json.dumps(table_dict, indent=1, ensure_ascii=False))
    return jsons


def _filter_words(
    words: List[Dict[str, Any]],
    tables_bboxes: List[Tuple[int, int, int, int]],
) -> List[Dict[str, Any]]:
    """Remove words that overlap a table or contain non-printable chars."""
    filtered = []
    for w in words:
        x0, top = float(w["x0"]), float(w["top"])
        if any(bx0 <= x0 <= bx2 and by0 <= top <= by3 for bx0, by0, bx2, by3 in tables_bboxes):
            continue
        clean_text = clean_cell_text(w["text"])
        if NON_PRINTABLE_RE.search(w["text"]):
            continue
        w["text"] = clean_text
        filtered.append(w)
    return filtered


def _build_word_info(
    words: List[Dict[str, Any]],
    char_index: CharIndex,
) -> List[Word]:
    """Convert raw pdfplumber words into Word dataclass instances."""
    def is_bold(name: str) -> bool:
        n = name.lower()
        return "bold" in n or "bd" in n or "black" in n

    word_objs: List[Word] = []
    for w in words:
        x0, y0, x1, y1 = map(float, (w["x0"], w["top"], w["x1"], w["bottom"]))
        mask = char_index.inside(x0, x1, y0, y1)
        sizes = char_index.sizes[mask]
        fonts = [char_index.fonts[i] for i in np.nonzero(mask)[0]]
        bolds = [is_bold(f) for f in fonts]

        font_size = float(sizes.max()) if sizes.size else 0.0
        word_objs.append(
            Word(
                text=w["text"],
                x0=x0,
                y0=y0,
                x1=x1,
                y1=y1,
                font_size=font_size,
                font_name=fonts[0] if fonts else "Unknown",
                bold=any(bolds),  # bold only if at least one character uses a bold font
            )
        )
    return word_objs

def _group_blocks(
    words: List[Word],
    page_width: float,
    page_height: float,
) -> List[Block]:
    """Cluster words into logical blocks using Union-Find, cleaning text and merging hyphen-split words."""

    merged_words = []
    skip_next = False

    for i, word in enumerate(words):
        if skip_next:
            skip_next = False
            continue

        text = word.text.strip()

        # If a word ends with a hyphen (possibly at a line break), merge it with the next word
        if text.endswith('-') and i + 1 < len(words):
            next_word = words[i + 1]
            merged_text = re.sub(r'-\s*$', '', text) + next_word.text.lstrip()
            merged_word = replace(word, text=merged_text)
            merged_words.append(merged_word)
            skip_next = True
        else:
            # Clean trailing hyphens and extra spaces (no merge)
            cleaned_text = re.sub(r'-\s*$', '', text).strip()
            if cleaned_text != text:
                word = replace(word, text=cleaned_text)
            merged_words.append(word)

    # thresholds in pixels - derived from percentages
    max_dx = int(round(page_width * 0.0151))   # 1.51 %, ~9px
    max_dy = int(round(page_height * 0.0143))  # 1.43 %, ~12px

    blocks = cluster_words(merged_words, max_dx, max_dy)

    # Filter out empty blocks and blocks with only a single printable character
    filtered_blocks = []
    for block in blocks:
        combined_text = " ".join(w.text for w in block.words)
        stripped_text = combined_text.strip()

        if stripped_text and len(stripped_text) > 1:
            printable_chars = ''.join(c for c in stripped_text if not c.isspace())
            if len(printable_chars) > 1:
                filtered_blocks.append(block)

    return filtered_blocks


# --------------------------------------------------------------
# 7. Page worker - orchestrator
# --------------------------------------------------------------

def process_page_worker(args: Tuple[int, str]) -> Tuple[int, str]:
    """Process a single page; returns (page_number, rendered_text)."""
    page_no, path = args
    try:
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_no]
            w, h = page.width, page.height

            # Crop margins
            margin_x = w * Config.LEFT_RIGHT_MARGIN_PCT / 100.0
            margin_y = h * Config.TOP_BOTTOM_MARGIN_PCT / 100.0
            cropped_page = page.crop((margin_x, margin_y, w - margin_x, h - margin_y))

            # ---------- Tables ----------
            tables_json = _extract_tables(cropped_page)

            # ---------- Words ----------
            table_bboxes = [clamp_bbox(t.bbox, w, h) for t in cropped_page.find_tables()]
            raw_words = cropped_page.extract_words(**Config.TEXT_EXTRACT_SETTINGS)
            # Clean line break artifacts from PDF text extraction
            filtered_raw = _filter_words(raw_words, table_bboxes)
            char_index = CharIndex.build(cropped_page.chars)

            words = _build_word_info(filtered_raw, char_index)
            avg_font_size = float(np.mean([w.font_size for w in words])) if words else 0.0

            # ---------- Blocks ----------
            blocks = _group_blocks(words, w, h)

            # ---------- Sorting (reading order) ----------
            def reading_score(block: Block) -> Tuple[float, float]:
                x0, y0, x1, y1 = block.bbox()
                height = y1 - y0
                width = x1 - x0
                area_log = math.log1p(width * height)
                return (y0 * 0.7 + x0 * 0.3 - area_log * 0.05, y0)

            blocks.sort(key=reading_score)

            # ---------- Images ----------
            images: List[ImageInfo] = []
            for im in cropped_page.images:
                img_bbox = (
                    float(im["x0"]),
                    h - float(im["y1"]),
                    float(im["x1"]),
                    h - float(im["y0"]),
                )
                images.append(ImageInfo(bbox=img_bbox, obj=im))

            # ---------- Assemble output ----------
            lines: List[str] = [f"\n\n--- Page {page_no + 1} ---\n\n"]

            # ---------- Split blocks into large and small ones ----------
            large_blocks: List[Block] = []
            small_blocks: List[Block] = []

            for block in blocks:
                x0, y0, x1, y1 = block.bbox()
                area = (x1 - x0) * (y1 - y0)

                if area > 3000:
                    large_blocks.append(block)
                else:
                    small_blocks.append(block)

            # Collect small blocks that sit far away from a large block
            nearby_small_blocks: List[Tuple[Block, List[List[Word]]]] = []

            for small_block in small_blocks:
                x0_s, y0_s, x1_s, y1_s = small_block.bbox()
                small_area = (x1_s - x0_s) * (y1_s - y0_s)

                # Only process small blocks under 3000 px²
                if small_area >= 3000:
                    continue

                # Check proximity to large blocks
                for large_block in large_blocks:
                    x0_l, y0_l, x1_l, y1_l = large_block.bbox()
                    large_area = (x1_l - x0_l) * (y1_l - y0_l)

                    # Only consider large blocks over 3000 px²
                    if large_area <= 3000:
                        continue

                    # Gap between the two blocks
                    dx = max(0, max(x0_s, x0_l) - min(x1_s, x1_l))
                    dy = max(0, max(y0_s, y0_l) - min(y1_s, y1_l))

                    # At least 25 px away in both directions
                    if dx >= 25 and dy >= 25:
                        # Sort words in the block for consistent output
                        sorted_words = sorted(small_block.words, key=lambda w: (w.y0, w.x0))
                        nearby_small_blocks.append((small_block, [sorted_words]))
                        break

            # ---------- Process regular blocks ----------
            for block in blocks:
                # Skip small blocks that were collected above
                if any(block is small_block for small_block, _ in nearby_small_blocks):
                    continue

                # ------------------------------------------------------------------
                # One line per block (preserve any internal \n or \r)
                # ------------------------------------------------------------------
                sorted_words = sorted(block.words, key=lambda w: (w.y0, w.x0))
                combined_text = " ".join(w.text for w in sorted_words)

                # ------------------------------------------------------------------
                # Labeling heuristics
                # ------------------------------------------------------------------
                chapter_hits = 0
                important_hits = 0
                for wobj in block.words:
                    # Skip tokens shorter than 4 characters with no alphabetic characters (e.g. page numbers)
                    if len(wobj.text) < 4 and not any(c.isalpha() for c in wobj.text):
                        continue
                    size_ratio = (
                        wobj.font_size / avg_font_size if avg_font_size else 0.0
                    )
                    if size_ratio >= 1.15:
                        chapter_hits += 1
                    elif wobj.bold and size_ratio >= 1.0:
                        important_hits += 1

                label: str | None = None
                hits = chapter_hits + important_hits
                if hits > 1 or (hits == 1 and chapter_hits):
                    label = "CHAPTER" if chapter_hits else "IMPORTANT"

                # ------------------------------------------------------------------
                # Append block text (single line) and an empty line afterwards
                # ------------------------------------------------------------------
                if label:
                    line_text = f"[{label}] {combined_text}"
                else:
                    line_text = combined_text

                lines.append(line_text)
                lines.append("")  # blank line after every text block

            # ---------- Tables ----------
            for idx, tbl_json in enumerate(tables_json, 1):
                lines.append(f'"table {idx}":\n{tbl_json}')

            # ---------- Small blocks far away from large blocks ----------
            if nearby_small_blocks:
                lines.append("\n--- Blocks with unsorted small text snippets far away from large blocks. ---")
                for i, (blk, lns) in enumerate(nearby_small_blocks, 1):
                    lines.append(f"Block {i}:")
                    for j, line_words in enumerate(lns):
                        txt = " ".join(w.text for w in line_words)
                        lines.append(txt)

            return page_no, "\n".join(lines)

    except Exception as exc:  # pragma: no cover
        err_msg = f"[ERROR] Page {page_no + 1}: {exc.__class__.__name__}: {exc}"
        logging.exception(err_msg)
        return page_no, err_msg


def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    results = []
    for i in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        result = process_page_worker((i, path))
        results.append(result)
        if tracker is not None:
            tracker.update()
        if progress_callback and tracker is not None:
            report_status(tracker, progress_callback)
    return results


def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    args = [(i, path) for i in range(page_number)]
    results = [None] * page_number

    def callback(result):
        if result is None:
            return
        page, _ = result
        results[page] = result
        if tracker is not None:
            tracker.update()
        if progress_callback and tracker is not None:
            report_status(tracker, progress_callback)

    max_workers = min(page_number, get_physical_cores())
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_page_worker, arg): arg for arg in args}
        for future in concurrent.futures.as_completed(futures):
            callback(future.result())

    return [r for r in results if r]


def report_status(tracker, progress_callback=None):
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
    else:
        print(f"[STATUS] {status['processed_pages']}/{status['total_pages']} pages "
              f"({status['pages_per_sec']} pages/s, "
              f"elapsed: {status['elapsed_time']} min, "
              f"est. remaining: {status['est_time']} min)")


def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    if stop_flag and stop_flag.is_set():
        return 0

    if parallel:
        results = run_parallel(path, page_number, tracker, progress_callback, stop_flag)
    else:
        results = run_serial(path, page_number, tracker, progress_callback, stop_flag)

    results = [r for r in results if r]  # filter out None entries (after a stop)

    results.sort(key=lambda x: x[0])
    text_output = "\n".join(text for _, text in results)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(text_output)

    return page_number


def _process_single_pdf(path):
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text extraction not allowed")

            pages = list(PDFPage.create_pages(document))
            return (path, len(pages), None)

    except (PDFEncryptionError, PDFPasswordIncorrect) as e:
        return (path, 0, f"[ERROR] File is password protected: {path} ({type(e).__name__}: {e})\n")
    except PDFSyntaxError as e:
        return (path, 0, f"[ERROR] Invalid PDF syntax: {path} ({type(e).__name__}: {e})\n")
    except PDFTextExtractionNotAllowed as e:
        return (path, 0, f"[ERROR] Text extraction not allowed: {path} ({type(e).__name__}: {e})\n")
    except Exception as e:
        return (path, 0, f"[ERROR] Error in file {path}: {type(e).__name__}: {e}\n")

def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    suppress_pdfminer_logging()
    total = 0
    page_info = []

    def handle_result(path, count, error):
        nonlocal total
        if error:
            if error_callback:
                error_callback(error)
            else:
                print(error, end="")
        else:
            page_info.append((path, count))
            total += count
            if progress_callback:
                progress_callback(total)  # report back to the GUI

    if len(pdf_files) > 14:
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
            results = executor.map(_process_single_pdf, pdf_files)
            for path, count, error in results:
                handle_result(path, count, error)
    else:
        for path in pdf_files:
            path, count, error = _process_single_pdf(path)
            handle_result(path, count, error)

    return page_info, total


# -------------------- GUI --------------------
class FileManager(wx.Frame):
    def __init__(self, parent):
        super().__init__(parent, title="PDF Parser - Sevenof9_v7e", size=(1000, 800))
        self.files = []
        self.InitUI()
        self.stop_flag = threading.Event()

    def InitUI(self):
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)

        lbl1 = wx.StaticText(panel, label="Listed PDF files: (right-click to remove or open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)

        hbox_lbl1.AddStretchSpacer()  # pushes the button all the way to the right

        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)

        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser)
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # keep a reference so the button can be disabled/re-enabled
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)

        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status display
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")

        for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec, self.lbl_est_time, self.lbl_elapsed_time]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)

        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also included\n"
            "If:\n"
            "[INFO] File ready: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "• Adds the label “Page X” at the beginning of every page (absolute number)\n"
            "• Adds the label “CHAPTER” for large font and/or “IMPORTANT” for bold font\n"
            "\n"
            "The Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages\n"
            "Then, all small PDFs are processed in parallel\n"
            "Then, each large PDF is processed page by page in parallel\n"
        )
        self.text_ctrl.SetValue(help_text)

    def AddFolder(self, event):
        dlg = wx.DirDialog(self, "Select Folder")
        if dlg.ShowModal() == wx.ID_OK:
            for root, _, files in os.walk(dlg.GetPath()):
                for f in files:
                    if f.lower().endswith(".pdf"):
                        path = os.path.normpath(os.path.join(root, f))
                        if path not in self.files:
                            self.files.append(path)
                            self.listbox.Append(path)
        dlg.Destroy()

    def AddFile(self, event):
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.Append(path)

    def RemoveFile(self, event):
        for i in reversed(self.listbox.GetSelections()):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        i = self.listbox.GetSelections()
        if i:
            path = self.files[i[0]]
            if platform.system() == "Windows":
                os.startfile(path)
            elif platform.system() == "Darwin":
                subprocess.call(["open", path])
            else:
                subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        sel = self.listbox.GetSelections()
        if sel:
            path = self.files[sel[0]]
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(wx.TextDataObject(path))
                wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        sel = self.listbox.GetSelections()
        if sel:
            folder = os.path.dirname(self.files[sel[0]])
            if platform.system() == "Windows":
                subprocess.Popen(f'explorer "{folder}"')
            elif platform.system() == "Darwin":
                subprocess.call(["open", folder])
            else:
                subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        if self.listbox.GetSelections():
            self.PopupMenu(self.popup_menu, event.GetPosition())

    def StartParser(self, event):
        if not self.files:
            wx.MessageBox("Please select files first.", "Note", wx.OK | wx.ICON_INFORMATION)
            wx.CallAfter(self.start_btn.Enable)  # re-enable the button
            return

        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        page_info, total_pages = get_total_pages(
            self.files,
            error_callback=error_callback,
            progress_callback=update_total_pages_live
        )

        if total_pages == 0:
            self.AppendProg("[INFO] No pages found.\n")
            wx.CallAfter(self.start_btn.Enable)  # re-enable the button
            return

        tracker = StatusTracker(total_pages)

        def gui_progress_callback(status):
            wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
            wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']}")
            wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']}")
            wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        def background():
            small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
            large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]

            # Process small files, each in its own worker process
            if small:
                max_workers = max(1, min(len(small), get_physical_cores()))
                with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                    futures = {}
                    for path, count in small:
                        if self.stop_flag.is_set():
                            break
                        future = executor.submit(save_pdf, path, count, None, False, None)
                        futures[future] = (path, count)

                    for future in concurrent.futures.as_completed(futures):
                        if self.stop_flag.is_set():
                            break
                        path, count = futures[future]
                        try:
                            pages_processed = future.result()
                            tracker.update(pages_processed)
                            throttled_gui_callback(tracker.get_status())
                            wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} pages)\n")
                        except Exception as e:
                            wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

            # Process large files page by page in parallel
            for path, count in large:
                if self.stop_flag.is_set():
                    break

                try:
                    pages_processed = save_pdf(
                        path,
                        count,
                        tracker,
                        parallel=True,
                        progress_callback=throttled_gui_callback,
                        stop_flag=self.stop_flag
                    )
                    if pages_processed:
                        wx.CallAfter(
                            self.AppendProg,
                            f"[INFO] File ready: {path} ({pages_processed} pages)\n"
                        )
                    else:
                        wx.CallAfter(
                            self.AppendProg,
                            f"[INFO] Stopped: {path}\n"
                        )
                except Exception as e:
                    wx.CallAfter(
                        self.AppendProg,
                        f"[ERROR] File {path}: {str(e)}\n"
                    )

            wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
            wx.CallAfter(self.start_btn.Enable)  # re-enable the button
            self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def StopParser(self, event):
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        self.prog_ctrl.AppendText(text)


# -------------------- Entry point --------------------
def main():
    if len(sys.argv) > 1:
        pdf_files = sys.argv[1:]
        page_info, total_pages = get_total_pages(pdf_files)
        tracker = StatusTracker(total_pages)

        def cli_callback(status):
            print(json.dumps(status))

        for path, count in page_info:
            save_pdf(path, count, tracker, parallel=(count > PARALLEL_THRESHOLD), progress_callback=cli_callback)
    else:
        app = wx.App(False)
        frame = FileManager(None)
        frame.Show()
        app.MainLoop()


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()
build_v7e.py ADDED
@@ -0,0 +1,39 @@
import sys
import subprocess

# Path to the entry point (the main script); adjust as needed
entry_point = "PDF Parser - Sevenof9_v7e.py"


# The command handed to PyInstaller
cmd = [
    sys.executable,
    "-m", "PyInstaller",
    "--onefile",
    "--noconfirm",
    "--clean",
    "--noconsole",  # do not show a console window (important for GUI programs)

    # External dependencies that need explicit hidden imports
    "--hidden-import", "pdfminer.six",
    "--hidden-import", "joblib",
    "--hidden-import", "joblib.externals.loky.backend.resource_tracker",
    "--hidden-import", "pdfplumber.utils.exceptions",
    "--hidden-import", "pdfminer.layout",
    "--hidden-import", "pdfminer.pdfpage",
    "--hidden-import", "psutil",
    "--hidden-import", "multiprocessing",
    "--hidden-import", "rtree",
    "--hidden-import", "numpy",
    "--hidden-import", "concurrent.futures",
    "--hidden-import", "wx",  # wxPython is imported as "wx"

    entry_point
]

# Run the build
try:
    subprocess.run(cmd, check=True)
    print("Build finished.")
except subprocess.CalledProcessError as e:
    print(f"Build failed: {e}")