Upload 12 files
Browse files- 1 +21 -0
- 2ocr.sh +32 -0
- aidocs.py +155 -0
- jpegdir.py +98 -0
- random/index.html +100 -0
- shove.sh +38 -0
- showfiles +98 -0
- skel.py +143 -0
- summ +0 -0
- summarize2 +415 -0
- tetris32b.html +275 -0
- vttclean.py +74 -0
1
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FAILURE: Build failed with an exception.
|
| 3 |
+
|
| 4 |
+
* What went wrong:
|
| 5 |
+
Directory '/Users/jim/work/hacks' does not contain a Gradle build.
|
| 6 |
+
|
| 7 |
+
A Gradle build's root directory should contain one of the possible settings files: settings.gradle, settings.gradle.kts, settings.gradle.dcl. It may also contain one of the possible build files: build.gradle, build.gradle.kts, build.gradle.dcl.
|
| 8 |
+
|
| 9 |
+
To create a new Gradle build in this directory run 'gradle init'
|
| 10 |
+
|
| 11 |
+
For more information about the 'init' task, please refer to https://docs.gradle.org/8.12-rc-1/userguide/build_init_plugin.html in the Gradle documentation.
|
| 12 |
+
|
| 13 |
+
For more details on creating a Gradle build, please refer to https://docs.gradle.org/8.12-rc-1/userguide/tutorial_using_tasks.html in the Gradle documentation.
|
| 14 |
+
|
| 15 |
+
* Try:
|
| 16 |
+
> Run gradle init to create a new Gradle build in this directory.
|
| 17 |
+
> Run with --stacktrace option to get the stack trace.
|
| 18 |
+
> Run with --info or --debug option to get more log output.
|
| 19 |
+
> Get more help at https://help.gradle.org.
|
| 20 |
+
|
| 21 |
+
BUILD FAILED in 413ms
|
2ocr.sh
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# OCR every TIFF page in INPUT_DIR with Tesseract, then merge the
# per-page PDFs into one searchable PDF.
#
# Requires: tesseract, and either pdfunite (poppler-utils) or
# ImageMagick's convert for the merge step.

# Directory containing TIFF files
INPUT_DIR="atreatiseonlawp00chitgoog_tif"
OUTPUT_PDF="output_searchable.pdf"
TEMP_DIR="temp_ocr"

# BUG FIX: with no matching files the original passed the literal
# glob string to tesseract.  Expand once with nullglob and bail out
# early when there is nothing to do.
shopt -s nullglob
tif_files=("$INPUT_DIR"/*.tif)
if [ ${#tif_files[@]} -eq 0 ]; then
    echo "No .tif files found in $INPUT_DIR" >&2
    exit 1
fi

# Create a temporary directory to store processed files
mkdir -p "$TEMP_DIR"

# Run Tesseract on each page; each invocation emits one searchable PDF.
for file in "${tif_files[@]}"; do
    # Extract the filename without extension
    filename=$(basename "$file" .tif)
    tesseract "$file" "$TEMP_DIR/$filename" -l eng pdf
done

# Combine all individual page PDFs into a single PDF
if command -v pdfunite >/dev/null 2>&1; then
    # pdfunite (poppler-utils) preserves the text layer reliably
    pdfunite "$TEMP_DIR"/*.pdf "$OUTPUT_PDF"
else
    # Fallback: ImageMagick convert (may rasterize; kept for systems
    # without poppler-utils)
    convert "$TEMP_DIR"/*.pdf "$OUTPUT_PDF"
fi

# Clean up temporary directory
rm -r "$TEMP_DIR"

echo "Searchable PDF created as $OUTPUT_PDF"
|
aidocs.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import fnmatch
import glob
import re
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Optional, List, Dict, Set, Literal, Tuple, NamedTuple, Union

import click
|
| 8 |
+
|
| 9 |
+
@dataclass
class PathPattern:
    """Represents either a direct source:target mapping or a wildcard pattern.

    Attributes:
        pattern: Source path or glob pattern (e.g. 'src/api' or '**/*.md').
        target_template: Explicit target path when the spec was written as
            'source:target'; None for pure wildcard patterns.
    """
    pattern: str
    target_template: Optional[str] = None

    @classmethod
    def parse(cls, spec: str) -> 'PathPattern':
        """Parse a path specification into pattern and optional target.

        'source:target' produces a direct mapping; anything else is
        treated as a wildcard pattern.
        """
        if ':' in spec:
            source, target = spec.split(':', 1)
            return cls(source, target)
        return cls(spec)

    def resolve(self, root_dir: Path) -> List['PathMapping']:
        """Resolve this pattern into concrete PathMapping objects.

        Direct mappings yield exactly one mapping; wildcard patterns are
        expanded with glob (recursive '**' supported) and each matched
        file is mapped to its path relative to root_dir when possible.
        """
        if self.target_template is not None:
            # Direct mapping case: no filesystem expansion needed.
            return [PathMapping(Path(self.pattern), Path(self.target_template))]

        # Wildcard pattern case.
        matches = []
        for path in glob.glob(self.pattern, recursive=True):
            source = Path(path)
            if source.is_file():
                # For files, maintain relative structure.  BUG FIX: the
                # original tested `root_dir in source.parents`, which never
                # matches because glob returns relative paths while
                # root_dir is absolute; resolve both sides instead.
                try:
                    relative = source.resolve().relative_to(root_dir.resolve())
                except ValueError:
                    relative = source
                matches.append(PathMapping(source, relative))
        return matches

    def validate(self) -> None:
        """Validate pattern constraints.

        Raises:
            ValueError: on path traversal, backslashes in the target, or
                characters that are invalid in portable paths.
        """
        if self.target_template:
            # Check for path traversal in target.
            if '..' in self.target_template:
                raise ValueError(f"Target path '{self.target_template}' cannot contain '..'")

            # Targets must use forward slashes on every platform.
            if '\\' in self.target_template:
                raise ValueError("Target path must use forward slashes")

        # Validate wildcard pattern.
        if any(c in self.pattern for c in '<>|"'):
            raise ValueError(f"Invalid characters in pattern: {self.pattern}")
|
| 53 |
+
|
| 54 |
+
class WikiTransformer:
    """Transforms matched source files into a wiki directory layout.

    Collaborators (SizeValidator, Console, Progress, MergeStrategy,
    PathMapping, and the methods _setup_logging / process_mapping) are
    defined elsewhere in the project and are not visible here.
    """

    def __init__(self, size_limit: 'SizeSpec', output_dir: Path,
                 merge_strategy: MergeStrategy,
                 debug: bool = False):
        # Per-document size limit, enforced via SizeValidator.
        self.validator = SizeValidator(size_limit)
        self.output_dir = output_dir
        self.merge_strategy = merge_strategy
        self.debug = debug
        self.console = Console()
        self.log = self._setup_logging()
        # Inodes already handled -- presumably to avoid reprocessing
        # hardlinked files; TODO confirm against process_mapping.
        self.processed_inodes: Set[int] = set()
        # Patterns are resolved relative to the process's working directory.
        self.root_dir = Path.cwd()

    async def resolve_patterns(self, patterns: List[str]) -> List[PathMapping]:
        """Resolve all patterns into concrete mappings.

        Invalid specs are logged and skipped (the loop continues);
        patterns that match nothing only produce a warning.
        """
        mappings = []
        for spec in patterns:
            try:
                pattern = PathPattern.parse(spec)
                pattern.validate()
                resolved = pattern.resolve(self.root_dir)
                if not resolved:
                    self.log.warning(f"Pattern '{spec}' matched no files")
                mappings.extend(resolved)
            except ValueError as e:
                self.log.error(f"Invalid pattern '{spec}': {e}")
                continue
        return mappings

    async def transform(self, patterns: List[str]):
        """Transform source trees based on patterns and mappings.

        Raises:
            ValueError: when no pattern matched anything, or when the
                target filesystem cannot support the chosen link type.
        """
        mappings = await self.resolve_patterns(patterns)

        if not mappings:
            raise ValueError("No valid paths matched the specified patterns")

        if not self.merge_strategy.validate_target(self.output_dir):
            raise ValueError(
                f"Target filesystem doesn't support {self.merge_strategy.link_type} links"
            )

        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Progress bar covers one unit of work per mapping.
        with Progress() as progress:
            task = progress.add_task(
                "[green]Processing files...",
                total=len(mappings)
            )

            for mapping in mappings:
                try:
                    await self.process_mapping(mapping)
                    progress.update(task, advance=1)
                except Exception as e:
                    # Per-file failures are logged, not fatal, so one bad
                    # file doesn't abort the whole run.
                    self.log.error(f"Failed to process {mapping}: {e}")
|
| 109 |
+
|
| 110 |
+
@click.command()
# BUG FIX: click.argument() does not accept a help= keyword (only
# options do); passing one raises TypeError at import time.  The
# argument usage is documented in the command docstring instead.
@click.argument('patterns', nargs=-1, required=True)
@click.option('-l', '--limit', type=SIZE, default='1M',
              help='Per-document size limit (e.g., 500K, 2M, 1G)')
@click.option('-d', '--debug', is_flag=True, help='Enable debug logging')
@click.option('-o', '--output-dir', type=click.Path(), default='wiki',
              help='Output directory')
@click.option('--link-type', type=click.Choice(['symlink', 'hardlink', 'copy']),
              default='symlink', help='File linking strategy')
@click.option('--follow-links/--no-follow-links', default=False,
              help='Follow symbolic links during traversal')
def main(patterns: List[str], limit: 'SizeSpec', debug: bool,
         output_dir: str, link_type: str, follow_links: bool):
    """Transform files into wiki structure using patterns or mappings.

    PATTERNS can be either:
    1. Colon-separated mappings: 'source:target'
    2. Wildcard patterns: '**/*.md', 'docs/**/*.rst'

    Examples:
        # Explicit mapping
        wiki_transform.py src/api:docs/api docs/intro:guide/start

        # Wildcard patterns
        wiki_transform.py '**/*.md' 'docs/**/*.rst'

        # Mixed usage
        wiki_transform.py src:api '**/*.md' 'legacy:archive'
    """
    # 'copy' means no linking at all; MergeStrategy encodes that as None.
    strategy = MergeStrategy(
        link_type=None if link_type == 'copy' else link_type,
        follow_links=follow_links
    )

    transformer = WikiTransformer(
        size_limit=limit,
        output_dir=Path(output_dir),
        merge_strategy=strategy,
        debug=debug
    )

    # Single async entry point; run to completion.
    asyncio.run(transformer.transform(patterns))

if __name__ == '__main__':
    main()
|
jpegdir.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import pytesseract
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import json
|
| 6 |
+
from typing import Dict, List
|
| 7 |
+
from concurrent.futures import ProcessPoolExecutor
|
| 8 |
+
import multiprocessing
|
| 9 |
+
|
| 10 |
+
def process_image(args) -> tuple:
    """
    Process a single image file with OCR.

    Args:
        args: Tuple of (filename, input_dir, output_dir)
    Returns:
        Tuple of (filename, extracted_text).  On failure the text is an
        "ERROR: ..." string instead of raising, so the process pool keeps
        working through the remaining files.
    """
    filename, input_dir, output_dir = args
    try:
        # Full path to image
        image_path = os.path.join(input_dir, filename)

        # Open and process image
        with Image.open(image_path) as img:
            # Extract text using pytesseract
            text = pytesseract.image_to_string(img)

        # Save individual text file next to the consolidated JSON output
        text_filename = Path(filename).stem + '.txt'
        text_path = os.path.join(output_dir, text_filename)
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(text)

        # BUG FIX: both messages printed the literal "(unknown)" instead
        # of interpolating the filename; restore the placeholder.
        print(f"Processed: {filename}")
        return filename, text

    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
        return filename, f"ERROR: {str(e)}"
|
| 41 |
+
|
| 42 |
+
def process_directory(input_dir: str, output_dir: str, max_workers: int = None) -> Dict[str, str]:
    """
    Process all JPEG files in a directory and perform OCR using multiple processes.

    Args:
        input_dir: Directory containing JPEG files
        output_dir: Directory to save OCR results
        max_workers: Maximum number of worker processes (defaults to CPU count)

    Returns:
        Dictionary mapping filenames to extracted text
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # If max_workers not specified, use CPU count
    if max_workers is None:
        max_workers = multiprocessing.cpu_count()

    # Supported image extensions, compared case-insensitively.  The
    # original enumerated only all-lower and all-upper variants, so
    # mixed-case names like photo.Jpg were silently skipped.
    valid_extensions = {'.jpg', '.jpeg'}

    # Get a sorted list of valid image files (deterministic ordering;
    # os.listdir order is filesystem-dependent).
    image_files = sorted(
        f for f in os.listdir(input_dir)
        if Path(f).suffix.lower() in valid_extensions
    )

    # Prepare arguments for worker processes
    work_args = [(f, input_dir, output_dir) for f in image_files]

    # Process files concurrently
    results = {}
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for filename, text in executor.map(process_image, work_args):
            results[filename] = text

    # Save consolidated results to JSON
    json_path = os.path.join(output_dir, 'ocr_results.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results
|
| 85 |
+
|
| 86 |
+
if __name__ == "__main__":
    import argparse

    # Command-line front-end for process_directory.
    cli_parser = argparse.ArgumentParser(
        description='Perform OCR on all JPEG files in a directory')
    cli_parser.add_argument(
        'input_dir', help='Input directory containing JPEG files')
    cli_parser.add_argument(
        'output_dir', help='Output directory for OCR results')
    cli_parser.add_argument(
        '--workers', type=int, default=None,
        help='Number of worker processes (default: CPU count)')

    cli = cli_parser.parse_args()

    ocr_results = process_directory(cli.input_dir, cli.output_dir, cli.workers)
    print(f"\nProcessed {len(ocr_results)} files. Results saved to {cli.output_dir}")
|
random/index.html
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Force-Directed Voronoi Diagram</title>
    <script src="https://d3js.org/d3.v7.min.js"></script>
    <style>
        body {
            margin: 0;
            overflow: hidden;
        }
        svg {
            display: block;
        }
    </style>
</head>
<body>
    <script>
        // Dimensions
        const width = window.innerWidth;
        const height = window.innerHeight;

        // Create SVG container
        const svg = d3.select("body")
            .append("svg")
            .attr("width", width)
            .attr("height", height);

        // Random initial dataset
        let data = d3.range(20).map(() => ({
            x: Math.random() * width,
            y: Math.random() * height,
            value: Math.random()
        }));

        // Force simulation: points are pulled toward their initial
        // positions while being kept apart by the collision force.
        const simulation = d3.forceSimulation(data)
            .force("x", d3.forceX(d => d.x).strength(0.5))
            .force("y", d3.forceY(d => d.y).strength(0.5))
            .force("collide", d3.forceCollide(50))
            .on("tick", update);

        // Group for Voronoi cells
        const voronoiGroup = svg.append("g");

        // Group for circles
        const circleGroup = svg.append("g");

        function update() {
            // BUG FIX: d3.voronoi() was removed after d3 v4; with the
            // d3.v7 bundle loaded above it is undefined and the page
            // throws on the first tick.  v7 ships d3-delaunay instead:
            // build a Delaunay triangulation, then derive the Voronoi
            // diagram clipped to the viewport.
            const delaunay = d3.Delaunay.from(data, d => d.x, d => d.y);
            const diagram = delaunay.voronoi([0, 0, width, height]);

            // Update Voronoi cells
            const cells = voronoiGroup.selectAll("path")
                .data(data);

            cells.enter()
                .append("path")
                .merge(cells)
                .attr("d", (d, i) => diagram.renderCell(i))
                .attr("fill", d => d3.interpolateRainbow(d.value))
                .attr("stroke", "#000");

            cells.exit().remove();

            // Update circles
            const circles = circleGroup.selectAll("circle")
                .data(data);

            circles.enter()
                .append("circle")
                .merge(circles)
                .attr("r", 5)
                .attr("fill", "black")
                .attr("cx", d => d.x)
                .attr("cy", d => d.y);

            circles.exit().remove();
        }

        // Add a new data point every 2 seconds
        setInterval(() => {
            data.push({
                x: Math.random() * width,
                y: Math.random() * height,
                value: Math.random()
            });

            // Re-seed the simulation with the grown dataset and reheat it.
            simulation.nodes(data);
            simulation.alpha(1).restart();
        }, 2000);
    </script>
</body>
</html>
|
shove.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Stack output-*.png pages into vertical batches and run easyocr on
# each batch, appending all text to RESULTS_FILE.

# Variables
BATCH_SIZE=30
COUNTER=0
BATCH_INDEX=0
OUTPUT_DIR="batches"
RESULTS_FILE="ocr_results.txt"
BATCH_FILES=()

# Create output directory if not exists
mkdir -p "$OUTPUT_DIR"

# Clear results file
: > "$RESULTS_FILE"

# Enumerate inputs once.  BUG FIX: the original re-ran `ls ... | wc -l`
# on every iteration, which is slow and breaks on unusual filenames.
shopt -s nullglob
ALL_FILES=(output-*.png)
TOTAL=${#ALL_FILES[@]}

if (( TOTAL == 0 )); then
    echo "No output-*.png files found." >&2
    exit 1
fi

# Loop through PNG files in batches of $BATCH_SIZE
for FILE in "${ALL_FILES[@]}"; do
    # Add file to batch array
    BATCH_FILES+=("$FILE")
    COUNTER=$((COUNTER + 1))

    # Process batch when size is reached or on the last file
    if (( COUNTER % BATCH_SIZE == 0 || COUNTER == TOTAL )); then
        # BUG FIX: deriving the name from COUNTER / BATCH_SIZE made a
        # final partial batch overwrite the previous full one; use a
        # dedicated batch index instead.
        BATCH_INDEX=$((BATCH_INDEX + 1))
        BATCH_NAME="${OUTPUT_DIR}/batch_${BATCH_INDEX}.png"

        # BUG FIX: the concat: protocol does not work for PNG stills and
        # vstack is a filter, not an output flag.  Feed every file as its
        # own -i input and stack them with filter_complex.
        if (( ${#BATCH_FILES[@]} == 1 )); then
            # vstack needs at least two inputs; a singleton batch is a copy.
            cp "${BATCH_FILES[0]}" "$BATCH_NAME"
        else
            FFMPEG_INPUTS=()
            for f in "${BATCH_FILES[@]}"; do
                FFMPEG_INPUTS+=(-i "$f")
            done
            ffmpeg -y "${FFMPEG_INPUTS[@]}" \
                -filter_complex "vstack=inputs=${#BATCH_FILES[@]}" \
                "$BATCH_NAME"
        fi

        # Run easyocr on the concatenated image
        echo "Processing $BATCH_NAME..."
        easyocr -l en -f "$BATCH_NAME" --gpu True >> "$RESULTS_FILE"

        # Reset batch files array
        BATCH_FILES=()
    fi
done

echo "OCR processing complete. Results saved to $RESULTS_FILE."
|
showfiles
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Mission Statement:
# This script displays the contents of specified files with formatted headers.
# It provides optional file size limits through the -k flag (specified in KB).
# Without the -k flag, files are shown in their entirety.
# With -k flag, files larger than the specified size are truncated with a warning.
# The script handles both Linux and MacOS systems using compatible stat commands.
# Color output is available via the -c flag for better visual organization.

# ANSI color codes
BLUE='\033[0;34m'
GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m' # No Color

usage() {
    # Quote "$0" so installation paths containing spaces don't word-split.
    echo "Usage: $(basename "$0") [-k size_in_kb] [-c] files..."
    echo "  -k: Maximum file size in KB (optional)"
    echo "  -c: Enable color output"
    exit 1
}

# Initialize variables
COLOR=false
MAX_SIZE_KB=""

# Parse command line options
while getopts "k:c" opt; do
    case $opt in
        k) MAX_SIZE_KB="$OPTARG";;
        c) COLOR=true;;
        ?) usage;;
    esac
done

# Shift past the options
shift $((OPTIND-1))

# Check if any files were specified
if [ $# -eq 0 ]; then
    usage
fi

# Reject a non-numeric -k early; the arithmetic comparison below would
# otherwise abort mid-run with a confusing error.
if [ -n "$MAX_SIZE_KB" ] && ! [[ "$MAX_SIZE_KB" =~ ^[0-9]+$ ]]; then
    echo "Error: -k requires a non-negative integer (got '$MAX_SIZE_KB')" >&2
    usage
fi

# Get file size in BYTES (compatible with both Linux and MacOS); callers
# divide by 1024 for KB.
get_file_size() {
    if [[ "$OSTYPE" == "darwin"* ]]; then
        stat -f %z "$1"
    else
        stat --format=%s "$1"
    fi
}

# Format and display file header
show_header() {
    local file="$1"
    local size_bytes
    size_bytes=$(get_file_size "$file")
    local size_kb=$((size_bytes / 1024))

    if $COLOR; then
        echo -e "\n${BLUE}=== File: ${GREEN}$file${BLUE} (${size_kb}KB) ===${NC}"
    else
        echo -e "\n=== File: $file (${size_kb}KB) ==="
    fi
}

# Process each file
for file in "$@"; do
    if [ ! -f "$file" ]; then
        if $COLOR; then
            echo -e "${RED}Error: '$file' does not exist or is not a regular file${NC}" >&2
        else
            echo "Error: '$file' does not exist or is not a regular file" >&2
        fi
        continue
    fi

    show_header "$file"

    if [ -n "$MAX_SIZE_KB" ]; then
        size_bytes=$(get_file_size "$file")
        size_kb=$((size_bytes / 1024))

        if [ "$size_kb" -gt "$MAX_SIZE_KB" ]; then
            if $COLOR; then
                echo -e "${RED}File size ($size_kb KB) exceeds limit ($MAX_SIZE_KB KB). Showing first $MAX_SIZE_KB KB:${NC}"
            else
                echo "File size ($size_kb KB) exceeds limit ($MAX_SIZE_KB KB). Showing first $MAX_SIZE_KB KB:"
            fi
            head -c $((MAX_SIZE_KB * 1024)) "$file"
            echo -e "\n[Truncated...]"
        else
            cat "$file"
        fi
    else
        cat "$file"
    fi
done
|
skel.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!python3
|
| 2 |
+
import unittest
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import tempfile
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
class TestSkeletonMapper(unittest.TestCase):
    """Regression tests for Kotlin skeleton-extraction patterns.

    Relies on create_language_patterns() and extract_skeleton() defined
    elsewhere in this project.
    """

    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.patterns = create_language_patterns()

    def tearDown(self):
        # BUG FIX: the temp directory was never removed, leaking one
        # directory (plus test files) per test run.
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def create_test_file(self, content: str, extension: str) -> str:
        """Write *content* to a temp file with *extension*; return its path."""
        path = Path(self.temp_dir) / f"test{extension}"
        path.write_text(content)
        return str(path)

    def test_kotlin_edge_cases(self):
        """Generics, initialized properties, and annotations must survive."""
        kotlin_code = '''
@DslMarker
annotation class NioProxyDsl

interface EnhancedNioProxy<T : Any> {
    val original: T
    fun verifyIdentity(): Boolean = enhanced.equals(original)
}

class ProxyContext {
    private val _events = MutableSharedFlow<ProxyEvent>()
}
'''
        file_path = self.create_test_file(kotlin_code, ".kt")
        results = extract_skeleton(file_path, self.patterns)

        # BUG 1: Missing generic type parameters in class/interface detection
        self.assertIn("interface EnhancedNioProxy<T : Any>", results['interface'])

        # BUG 2: Property detection fails with initialization
        self.assertIn("val original: T", results['property'])

        # BUG 3: Annotation detection drops parameters
        self.assertIn("@DslMarker", results['annotation'])
|
| 42 |
+
|
| 43 |
+
def fix_kotlin_patterns():
    """Return corrected regex patterns for Kotlin skeleton extraction.

    Each pattern anchors at (optionally indented) line start and, where a
    declaration has a name, captures that identifier in group 1.  Generic
    parameter lists like ``<T : Any>`` are tolerated but not captured.
    """
    corrected = {}
    corrected['class'] = r'^\s*(?:data\s+)?class\s+(\w+)(?:<[^>]+>)?'
    corrected['function'] = r'^\s*fun\s+(\w+)(?:<[^>]+>)?'
    corrected['property'] = r'^\s*(?:var|val)\s+(\w+)(?:\s*:\s*[^=]+)?(?:\s*=.+)?'
    corrected['interface'] = r'^\s*interface\s+(\w+)(?:<[^>]+>)?'
    corrected['annotation'] = r'^\s*@(\w+)(?:\s*[\w\s.()]+)?'
    corrected['suspend'] = r'^\s*suspend\s+fun\s+\w+'
    return corrected
|
| 52 |
+
|
| 53 |
+
# Critical fixes for main implementation
|
| 54 |
+
def patch_implementation():
    """
    Critical patches for identified issues.

    Returns:
        Tuple (safe_grep, escape_grep_pattern, read_file_safe).
    """
    # BUG FIX: subprocess was referenced but never imported anywhere in
    # this module; import it locally so the helpers actually run.
    import subprocess

    # 1. Fix subprocess handling for large files
    def safe_grep(cmd: str, timeout: int = 30) -> str:
        """Run *cmd* through the shell; return stdout, or '' on timeout.

        NOTE(review): shell=True with externally supplied cmd is a
        command-injection risk -- callers must pass trusted input only.
        """
        try:
            return subprocess.run(
                cmd,
                shell=True,
                text=True,
                capture_output=True,
                timeout=timeout
            ).stdout
        except subprocess.TimeoutExpired:
            return ""

    # 2. Fix pattern escaping in grep command
    def escape_grep_pattern(pattern: str) -> str:
        """Escape parentheses so grep -E treats them literally."""
        return pattern.replace('(', '\\(').replace(')', '\\)')

    # 3. Add file encoding handling
    def read_file_safe(file_path: str) -> str:
        """Read a file as UTF-8, falling back to latin-1, then ''.

        BUG FIX: the original let OSError (missing/unreadable file)
        propagate, defeating the 'safe' contract; return '' instead.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception:
                return ""
        except OSError:
            return ""

    return safe_grep, escape_grep_pattern, read_file_safe
|
| 88 |
+
|
| 89 |
+
# Shell script fixes
|
| 90 |
+
def generate_fixed_shell_script():
    """Return a hardened bash snippet for skeleton extraction.

    The returned script null-delimits `find` output (so filenames with
    spaces/newlines survive), quotes expansions, and funnels grep through
    a safe_grep wrapper to prevent pattern injection.  The string below
    is runtime data and is returned verbatim.
    """
    return '''
#!/bin/bash

# Fixed file handling
while IFS= read -r -d '' file; do
    if [[ ! -f "$file" ]]; then
        continue
    fi

    # Handle filenames with spaces and special chars
    file_ext="${file##*.}"
    file_name=$(printf '%q' "$file")

    # Prevent grep pattern injection
    safe_grep() {
        local pattern=$1
        local file=$2
        grep -E "^[[:space:]]*${pattern}" "$file" 2>/dev/null || true
    }

    case "$file_ext" in
        kt|kts)
            safe_grep "(@\\w+|class\\s+\\w+|interface\\s+\\w+|fun\\s+\\w+)" "$file_name"
            ;;
        # ... other extensions
    esac
done < <(find . -type f -print0)
'''
|
| 119 |
+
|
| 120 |
+
# Runtime monitoring hooks
|
| 121 |
+
def add_monitoring():
    """Build a decorator that warns when a call is slow or memory-hungry.

    Returns:
        monitor_execution: decorator that times the wrapped call and
        samples process RSS before/after, printing a warning when the
        call exceeds 5 s wall time or grows RSS by more than 100 MB.

    Requires the third-party `psutil` package (imported lazily here so
    the rest of the module works without it).
    """
    import time
    import functools
    import psutil

    def monitor_execution(func):
        # BUG FIX: without functools.wraps the wrapper masked the wrapped
        # function's __name__/__doc__ -- which also made the warning
        # message below report 'wrapper' instead of the real name.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            process = psutil.Process()
            mem_before = process.memory_info().rss

            result = func(*args, **kwargs)

            elapsed = time.time() - start
            mem_after = process.memory_info().rss
            mem_delta = mem_after - mem_before

            # Thresholds: 5 s wall time or 100 MB RSS growth.
            if elapsed > 5.0 or mem_delta > 100*1024*1024:  # 100MB
                print(f"Warning: High resource usage in {func.__name__}")

            return result
        return wrapper

    return monitor_execution
|
summ
ADDED
|
File without changes
|
summarize2
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash

# CHARTER: This script is designed to process audio/video content, generate transcripts,
# summaries, and audio files. The following tasks are defined and must not be altered by any LLM:
# 1. Download content if a URL is provided
# 2. Generate or use existing VTT transcript
# 3. Clean the VTT transcript
# 4. Generate a summary of the transcript
# 5. Create a 16k VBR Opus OGG file for audio tracks (unless audio download is disabled)
# 6. Output both the cleaned VTT text and the summary
# 7. Exclude the WAV file from the results
# 8. Include the OGG file in the results only if both WAV and OGG were created
# This charter is unalterable and defines the core functionality of the script.

# Configuration (adjust these paths)
WHISPCC="$HOME/work/whisper.cpp" # ./main to run ; ./models for models
MODEL_PATH="$WHISPCC/models/ggml-small.en-tdrz.bin"
OUTPUT_DIR="$HOME/processed_audio"
CACHE_DIR="/tmp/summarize_cache"
OLLAMA_MODEL="llama3.1:latest"
# NOTE(review): the assignment below immediately overrides the one above, so
# the effective model is deepseek-coder-v2:16b; delete one line once the
# intended model is confirmed.
OLLAMA_MODEL="deepseek-coder-v2:16b"

# Prompts for different segments
FIRST_PROMPT="Summarize this beginning part of a transcript in one sentence, then provide bullet points with timestamps (00:00:00 sentence)."
MIDDLE_PROMPT="Summarize the key points of this part of the transcript in bullet points with timestamps (00:00:00 sentence)."
LAST_PROMPT="Summarize the main takeaways of this final part of the transcript in bullet points with timestamps (00:00:00 sentence)."

# Global variable to track job queue.  Each entry is a full command string
# that process_job_queue later eval's.
JOB_QUEUE=()

# Ensure output and cache directories exist
mkdir -p "$OUTPUT_DIR" "$CACHE_DIR"

# Parse command line options:
#   -f  use fabric instead of ollama for summarization
#   -n  disable audio (no OGG produced)
#   -a  re-enable audio (default)
#   -d  limit transcription to the first OPTARG seconds (local files only)
USE_FABRIC=false
DISABLE_AUDIO=false
DURATION=""
while getopts "fnad:" opt; do
    case $opt in
        f)
            USE_FABRIC=true
            ;;
        n)
            DISABLE_AUDIO=true
            ;;
        a)
            DISABLE_AUDIO=false
            ;;
        d)
            DURATION="$OPTARG"
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND-1))
|
| 59 |
+
|
| 60 |
+
# Function to get MD5 hash of a file.
# Prints only the hex digest of $1, without the trailing filename.
get_md5() {
    local digest
    digest=$(md5sum "$1")
    echo "${digest%% *}"
}
|
| 64 |
+
|
| 65 |
+
# Function to cache a file using hardlinks (atomic).
# Usage: cache_file INPUT_FILE EXTENSION
# On success prints the cache path on stdout and returns 0; all diagnostics
# go to stderr so callers can capture stdout cleanly.  The cache layout is
# $CACHE_DIR/<md5[0:2]>/<md5[2:4]>/<md5>_<sanitized-name><EXTENSION>.
cache_file() {
    local INPUT_FILE="$1"
    local EXTENSION="$2"

    # Check if the input file exists and is not empty
    if [ ! -s "$INPUT_FILE" ]; then
        echo "Error: Input file is empty or does not exist." >&2
        return 1
    fi

    local MD5=$(get_md5 "$INPUT_FILE")
    local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
    # Replace anything outside [a-zA-Z0-9._-] so the path is shell-safe.
    local SAFE_FILENAME=$(echo "$INPUT_FILE" | sed 's/[^a-zA-Z0-9._-]/_/g')
    local CACHE_FILE="$CACHE_SUBDIR/${MD5}_${SAFE_FILENAME}${EXTENSION}"

    echo "Cache operation: MD5 sum = $MD5" >&2
    echo "Cache file: $CACHE_FILE" >&2

    # Create cache subdirectory if it doesn't exist
    if ! mkdir -p "$CACHE_SUBDIR"; then
        echo "Error: Failed to create cache subdirectory." >&2
        return 1
    fi

    # Attempt to create the hardlink.  ln -f makes the publish step atomic
    # with respect to readers; NOTE(review): hardlinks require the cache and
    # input to live on the same filesystem — verify for /tmp setups.
    if ln -f "$INPUT_FILE" "$CACHE_FILE"; then
        echo "Cache file created: $CACHE_FILE" >&2
        echo "$CACHE_FILE"
        return 0
    else
        echo "Error: Failed to create cache file." >&2
        return 1
    fi
}
|
| 100 |
+
|
| 101 |
+
# Function to sanitize a string for use as a filename.
# Transliterates to ASCII, replaces unsafe characters with '_', lowercases.
sanitize_filename() {
    local raw="$1"
    echo "$raw" \
        | iconv -c -t ascii//translit \
        | sed 's/[^A-Za-z0-9._-]/_/g' \
        | tr '[:upper:]' '[:lower:]'
}
|
| 106 |
+
|
| 107 |
+
# Function to clean text read from stdin: strip inline tags, squeeze
# repeated spaces, then trim leading/trailing whitespace on each line.
clean_text() {
    sed -e 's/<[^>]*>//g' \
        | tr -s ' ' \
        | sed -e 's/^[ \t]*//' -e 's/[ \t]*$//'
}
|
| 111 |
+
|
| 112 |
+
# Function to summarize a segment of text.
# Usage: summarize_segment SEGMENT_TEXT PROMPT
# Prints the summary (or the original text for very short input) on stdout.
# NOTE(review): this function appears unused — process_vtt summarizes
# inline and process_segment duplicates this logic; consider consolidating.
summarize_segment() {
    local SEGMENT_TEXT="$1"
    local PROMPT="$2"
    local SUMMARY_OUTPUT=""

    # Count the number of lines in the input
    local LINE_COUNT=$(echo "$SEGMENT_TEXT" | wc -l)

    # If the input has less than 12 lines, remove cache and return a simple response
    if [ "$LINE_COUNT" -lt 12 ]; then
        local MD5=$(echo "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
        local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
        rm -f "$CACHE_SUBDIR/$MD5"*
        echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
        echo "$SEGMENT_TEXT"
        return 0
    fi

    if $USE_FABRIC; then
        SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
    else
        # Use ollama for summarization
        SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
    fi

    # $? here is the exit status of the command substitution assignment
    # above — the status of fabric/ollama propagates through the `fi`.
    if [ $? -ne 0 ]; then
        echo "Error in summarization: $SUMMARY_OUTPUT" >&2
        return 1
    fi

    echo "$SUMMARY_OUTPUT"
}
|
| 145 |
+
|
| 146 |
+
# Function to add a job to the queue.
# Each argument is stored as one JOB_QUEUE entry (a command string that
# process_job_queue later eval's).
add_job() {
    local job
    for job in "$@"; do
        JOB_QUEUE+=("$job")
    done
}
|
| 150 |
+
|
| 151 |
+
# Function to update the progress bar for a job.
# Usage: update_job_progress JOB_INDEX TOTAL_STEPS CURRENT_STEP JOB_MESSAGE
# NOTE(review): $JOB_COUNT is not set here — it is a `local` inside
# process_job_queue, so this only resolves via bash's dynamic scoping when
# called from there; currently no caller invokes this function at all.
update_job_progress() {
    local JOB_INDEX="$1"
    local TOTAL_STEPS="$2"
    local CURRENT_STEP="$3"
    local JOB_MESSAGE="$4"

    # ... (Implementation for updating the TUI progress bar)
    # You can use a library like 'whiptail' or 'dialog' for TUI elements
    # Example using echo for now:
    echo "Job $((JOB_INDEX+1))/$JOB_COUNT: $JOB_MESSAGE ($CURRENT_STEP/$TOTAL_STEPS)"
}
|
| 163 |
+
|
| 164 |
+
# Function to process the job queue: run every queued command in order.
# Entries are eval'd, so they must originate from trusted code paths only.
# JOB_COUNT stays a `local` with this exact name because bash locals are
# dynamically scoped and visible to callees.
process_job_queue() {
    local JOB_COUNT=${#JOB_QUEUE[@]}
    local i
    echo "Processing job queue ($JOB_COUNT jobs)..."
    for (( i = 0; i < JOB_COUNT; i++ )); do
        eval "${JOB_QUEUE[$i]}"
    done
}
|
| 173 |
+
|
| 174 |
+
# Function to process a single segment.
# Usage: process_segment SEGMENT_TEXT PROMPT OUTPUT_FILE
# Like summarize_segment, but writes the result to OUTPUT_FILE instead of
# stdout.  NOTE(review): the two functions are near-duplicates — consider
# having one delegate to the other.
process_segment() {
    local SEGMENT_TEXT="$1"
    local PROMPT="$2"
    local OUTPUT_FILE="$3"
    local SUMMARY_OUTPUT=""

    # Count the number of lines in the input
    local LINE_COUNT=$(echo "$SEGMENT_TEXT" | wc -l)

    # If the input has less than 12 lines, remove cache and return a simple response
    if [ "$LINE_COUNT" -lt 12 ]; then
        local MD5=$(echo "$SEGMENT_TEXT" | md5sum | cut -d' ' -f1)
        local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
        rm -f "$CACHE_SUBDIR/$MD5"*
        # This notice goes to stdout; only the raw text lands in OUTPUT_FILE.
        echo "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
        echo "$SEGMENT_TEXT" > "$OUTPUT_FILE"
        return 0
    fi

    if $USE_FABRIC; then
        SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
    else
        # Use ollama for summarization
        SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
    fi

    # $? reflects the command substitution assignment above.
    if [ $? -ne 0 ]; then
        echo "Error in summarization: $SUMMARY_OUTPUT" >&2
        return 1
    fi

    # Write the summary to the specified output file
    echo "$SUMMARY_OUTPUT" > "$OUTPUT_FILE"
}
|
| 209 |
+
|
| 210 |
+
# Function to process a VTT file (generate summary and handle versioning).
# Usage: process_vtt VTT_FILE URL
# Cleans the transcript via vttclean.py, splits it into three roughly equal
# segments, summarizes each with fabric or ollama, writes the combined
# summary next to OUTPUT_DIR, and prints it.  Exits the whole script (not
# just the function) on failure.
# NOTE(review): the URL parameter is currently unused.
process_vtt() {
    local VTT_FILE=$1
    local URL=$2
    local TEMP_DIR=$(mktemp -d)
    local BASE_NAME="${TEMP_DIR}/temp" # Temporary base name
    local CLEANED_TRANSCRIPT="${BASE_NAME}_cleaned.txt"
    local SUMMARY_FILE="${OUTPUT_DIR}/$(basename "$VTT_FILE" .vtt)_summary.txt"

    echo "Processing VTT file: $VTT_FILE"

    # Clean the VTT transcript; vttclean.py is expected beside this script.
    if ! python3 "$(dirname "$0")/vttclean.py" "$VTT_FILE" > "$CLEANED_TRANSCRIPT" 2>"${CLEANED_TRANSCRIPT}.error"; then
        echo "Error: Failed to clean the VTT file. Error log:" >&2
        cat "${CLEANED_TRANSCRIPT}.error" >&2
        exit 1
    fi

    # Check if the cleaned transcript is empty
    if [ ! -s "$CLEANED_TRANSCRIPT" ]; then
        echo "Error: Cleaned transcript is empty." >&2
        exit 1
    fi

    # Generate summary
    echo "Summarizing transcript..."
    local TOTAL_LINES=$(wc -l < "$CLEANED_TRANSCRIPT")
    # Integer division: transcripts shorter than 3 lines yield empty
    # segments — NOTE(review): confirm that is acceptable upstream.
    local SEGMENT_SIZE=$((TOTAL_LINES / 3))
    local FIRST_SEGMENT=$(head -n $SEGMENT_SIZE "$CLEANED_TRANSCRIPT")
    local MIDDLE_SEGMENT=$(sed -n "$((SEGMENT_SIZE + 1)),$((2 * SEGMENT_SIZE))p" "$CLEANED_TRANSCRIPT")
    local LAST_SEGMENT=$(tail -n $SEGMENT_SIZE "$CLEANED_TRANSCRIPT")

    # The whole brace group's stdout is captured into SUMMARY_FILE.
    {
        echo "Generating summary for first segment..."
        if $USE_FABRIC; then
            fabric -p summarize "$FIRST_SEGMENT"
        else
            ollama run "$OLLAMA_MODEL" "$FIRST_PROMPT" "$FIRST_SEGMENT"
        fi

        echo "Generating summary for middle segment..."
        if $USE_FABRIC; then
            fabric -p summarize "$MIDDLE_SEGMENT"
        else
            ollama run "$OLLAMA_MODEL" "$MIDDLE_PROMPT" "$MIDDLE_SEGMENT"
        fi

        echo "Generating summary for last segment..."
        if $USE_FABRIC; then
            fabric -p summarize "$LAST_SEGMENT"
        else
            ollama run "$OLLAMA_MODEL" "$LAST_PROMPT" "$LAST_SEGMENT"
        fi
    } > "$SUMMARY_FILE"

    if [ ! -s "$SUMMARY_FILE" ]; then
        echo "Error: Summary generation failed." >&2
        exit 1
    fi

    echo "Summarization complete."

    # Display the content of the summary file
    echo "Summary content:"
    echo "----------------------------------------"
    cat "$SUMMARY_FILE"
    echo "----------------------------------------"

    # Clean up
    rm -rf "$TEMP_DIR"
}
|
| 281 |
+
|
| 282 |
+
# Function to calculate the time difference between two timestamps in
# HH:MM:SS format.  Prints TIME1 - TIME2 in seconds; the result may be
# negative if TIME2 is later than TIME1.
time_difference() {
    local TIME1="$1" # Format: HH:MM:SS
    local TIME2="$2" # Format: HH:MM:SS

    # Extract hours, minutes, and seconds from each timestamp.
    local H1 M1 S1 H2 M2 S2
    IFS=: read -r H1 M1 S1 <<< "$TIME1"
    IFS=: read -r H2 M2 S2 <<< "$TIME2"

    # Force base-10 with 10#: without it, zero-padded fields such as "08"
    # or "09" are parsed as octal by bash arithmetic and abort with
    # "value too great for base".
    local T1=$(( 10#$H1 * 3600 + 10#$M1 * 60 + 10#$S1 ))
    local T2=$(( 10#$H2 * 3600 + 10#$M2 * 60 + 10#$S2 ))

    # Emit the signed difference in seconds.
    echo $(( T1 - T2 ))
}
|
| 306 |
+
|
| 307 |
+
# Main script logic: dispatch on the first argument — an existing .vtt
# file, a URL containing "http", or a local audio file — queue the
# transcript-processing job(s), then drain the queue.
if [ $# -eq 0 ]; then
    echo "Error: No input provided. Please provide a valid URL, VTT file, or a local audio file."
    exit 1
fi

if [[ "$1" == *.vtt ]]; then
    echo "Processing as VTT file..."
    add_job "process_vtt \"$1\" \"$1\""
elif [[ "$1" == *"http"* ]]; then
    echo "Processing as YouTube URL..."

    # Extract the video title
    VIDEO_TITLE=$(yt-dlp --get-title "$1")
    FINAL_BASE_NAME=$(sanitize_filename "$VIDEO_TITLE")

    # Attempt to download subtitles first (cheaper than transcribing).
    yt-dlp -N 3 --skip-download --write-auto-sub --sub-lang en \
        --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"

    VTT_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}.vtt" | head -n 1)

    if [ -n "$VTT_FILE" ]; then
        echo "Subtitles found, processing VTT file..."
        add_job "process_vtt \"$VTT_FILE\" \"$1\""
    else
        echo "No subtitles found, downloading audio and generating transcript..."
        # NOTE(review): when audio is disabled (-n) AND no subtitles exist,
        # nothing is queued and the script finishes without output.
        if [ "$DISABLE_AUDIO" = false ]; then
            if ! yt-dlp -N 3 -x --audio-format wav --postprocessor-args "-ar 16k" \
                --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"; then
                echo "Error: Failed to download audio using yt-dlp. Check the URL and your internet connection." >&2
                exit 1
            fi

            WAV_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}.wav" | head -n 1)

            if [ -z "$WAV_FILE" ]; then
                echo "Error: WAV file not found after download. Check yt-dlp output." >&2
                exit 1
            fi

            echo "Running Whisper-CPP to generate VTT transcript..."
            if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE"; then
                echo "Error: Whisper-CPP transcription failed. Check the model path and audio file." >&2
                exit 1
            fi
            # NOTE(review): assumes whisper writes <basename>.vtt; the local-file
            # branch below instead renames <file>.wav.vtt — confirm which naming
            # this whisper.cpp build actually produces.
            VTT_FILE="${WAV_FILE%.*}.vtt"

            add_job "process_vtt \"$VTT_FILE\" \"$1\""

            # Convert WAV to OGG Opus (charter point 5).
            echo "Converting WAV to OGG Opus..."
            OGG_FILE="${WAV_FILE%.wav}.ogg"
            if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
                echo "Error: Failed to convert to OGG format." >&2
                exit 1
            fi
            echo " - Audio: $OGG_FILE"
            # Remove the WAV file (charter point 7).
            rm "$WAV_FILE"
        fi
    fi
elif [ -f "$1" ]; then
    echo "Processing as local audio file..."
    INPUT_FILE="$1"
    WAV_FILE="${INPUT_FILE%.*}.wav"

    # Convert to WAV first if not already WAV; -d limits duration if set.
    if [[ "$INPUT_FILE" != *.wav ]]; then
        echo "Converting input to WAV format..."
        if ! ffmpeg -i "$INPUT_FILE" -ar 16000 -ac 1 -c:a pcm_s16le ${DURATION:+-t "$DURATION"} -y "$WAV_FILE"; then
            echo "Error: Failed to convert input to WAV format." >&2
            exit 1
        fi
    else
        WAV_FILE="$INPUT_FILE"
    fi

    echo "Running Whisper-CPP to generate VTT transcript..."
    if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE" ; then
        echo "Error: Whisper-CPP transcription failed." >&2
        exit 1
    fi

    # Whisper is assumed to emit "<file>.wav.vtt"; normalize to "<file>.vtt".
    VTT_FILE="${WAV_FILE%.wav}.vtt"
    mv "${WAV_FILE}.vtt" "$VTT_FILE"
    add_job "process_vtt \"$VTT_FILE\" \"$1\""

    if [ "$DISABLE_AUDIO" = false ]; then
        # Convert to OGG Opus
        echo "Converting to OGG Opus..."
        OGG_FILE="${WAV_FILE%.*}.ogg"
        if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
            echo "Error: Failed to convert to OGG format." >&2
            exit 1
        fi
        echo " - Audio: $OGG_FILE"
        # Remove the WAV file per CHARTER point 7
        rm "$WAV_FILE"
    fi



else
    echo "Error: Invalid input. Provide a valid URL, VTT file, or a local audio file."
    exit 1
fi

process_job_queue
|
tetris32b.html
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tetris Game</title>
    <style>
        body {
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
            background-color: #282c34;
        }
        canvas {
            border: 1px solid #fff;
        }
        #scoreboard {
            color: #fff;
            font-family: sans-serif;
            margin-bottom: 8px;
        }
    </style>
</head>
<body>
    <!-- FIX: updateScore() writes to #score, but no such element existed,
         so the page threw a TypeError during startup. -->
    <div id="scoreboard">Score: <span id="score">0</span></div>
    <canvas id="tetris" width="320" height="640"></canvas>
    <script>
        const canvas = document.getElementById('tetris');
        const context = canvas.getContext('2d');

        // Draw in playfield cells (20px each) instead of raw pixels.
        context.scale(20, 20);

        // Remove full rows, shift the stack down, and award doubling points.
        function arenaSweep() {
            let rowCount = 1;
            outer: for (let y = arena.length - 1; y > 0; --y) {
                for (let x = 0; x < arena[y].length; ++x) {
                    if (arena[y][x] === 0) {
                        continue outer; // row has a hole — keep it
                    }
                }

                // Row is full: recycle it as an empty row on top.
                const row = arena.splice(y, 1)[0].fill(0);
                arena.unshift(row);
                ++y; // re-check the same y after the shift

                player.score += rowCount * 10;
                rowCount *= 2; // each extra row in one sweep doubles the bonus
            }
        }

        // True when the player's piece overlaps the arena or its bounds.
        function collide(arena, player) {
            const [m, o] = [player.matrix, player.pos];
            for (let y = 0; y < m.length; ++y) {
                for (let x = 0; x < m[y].length; ++x) {
                    if (m[y][x] !== 0 &&
                        (arena[y + o.y] &&
                        arena[y + o.y][x + o.x]) !== 0) {
                        return true;
                    }
                }
            }
            return false;
        }

        // Build an h-by-w grid of zeros.
        function createMatrix(w, h) {
            const matrix = [];
            while (h--) {
                matrix.push(new Array(w).fill(0));
            }
            return matrix;
        }

        // Shape matrices; the nonzero value doubles as the color index.
        function createPiece(type) {
            if (type === 'T') {
                return [
                    [0, 0, 0],
                    [1, 1, 1],
                    [0, 1, 0],
                ];
            } else if (type === 'O') {
                return [
                    [2, 2],
                    [2, 2],
                ];
            } else if (type === 'L') {
                return [
                    [0, 3, 0],
                    [0, 3, 0],
                    [0, 3, 3],
                ];
            } else if (type === 'J') {
                return [
                    [0, 4, 0],
                    [0, 4, 0],
                    [4, 4, 0],
                ];
            } else if (type === 'I') {
                return [
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                    [0, 5, 0, 0],
                ];
            } else if (type === 'S') {
                return [
                    [0, 6, 6],
                    [6, 6, 0],
                    [0, 0, 0],
                ];
            } else if (type === 'Z') {
                return [
                    [7, 7, 0],
                    [0, 7, 7],
                    [0, 0, 0],
                ];
            }
        }

        // Repaint the whole frame: background, settled stack, falling piece.
        function draw() {
            context.fillStyle = '#282c34';
            context.fillRect(0, 0, canvas.width, canvas.height);

            drawMatrix(arena, { x: 0, y: 0 });
            drawMatrix(player.matrix, player.pos);
        }

        function drawMatrix(matrix, offset) {
            matrix.forEach((row, y) => {
                row.forEach((value, x) => {
                    if (value !== 0) {
                        context.fillStyle = colors[value];
                        context.fillRect(x + offset.x,
                            y + offset.y,
                            1, 1);
                    }
                });
            });
        }

        // Stamp the player's piece into the arena.
        function merge(arena, player) {
            player.matrix.forEach((row, y) => {
                row.forEach((value, x) => {
                    if (value !== 0) {
                        arena[y + player.pos.y][x + player.pos.x] = value;
                    }
                });
            });
        }

        // Advance the piece one row; lock it in on contact.
        function playerDrop() {
            player.pos.y++;
            if (collide(arena, player)) {
                player.pos.y--;
                merge(arena, player);
                playerReset();
                arenaSweep();
                updateScore();
            }
            dropCounter = 0;
        }

        function playerMove(dir) {
            player.pos.x += dir;
            if (collide(arena, player)) {
                player.pos.x -= dir;
            }
        }

        // Rotate with simple wall-kicks; revert if no offset fits.
        function playerRotate(dir) {
            const pos = player.pos.x;
            let offset = 1;
            rotate(player.matrix, dir);
            while (collide(arena, player)) {
                player.pos.x += offset;
                offset = -(offset + (offset > 0 ? 1 : -1));
                if (offset > player.matrix[0].length) {
                    rotate(player.matrix, -dir);
                    player.pos.x = pos;
                    return;
                }
            }
        }

        // Transpose, then reverse rows (cw) or row order (ccw).
        function rotate(matrix, dir) {
            for (let y = 0; y < matrix.length; ++y) {
                for (let x = 0; x < y; ++x) {
                    [
                        matrix[x][y],
                        matrix[y][x],
                    ] = [
                        matrix[y][x],
                        matrix[x][y],
                    ];
                }
            }

            if (dir > 0) {
                matrix.forEach(row => row.reverse());
            } else {
                matrix.reverse();
            }
        }

        // Spawn a random piece at the top; clear the board on game over.
        function playerReset() {
            const pieces = 'ILJOTSZ';
            player.matrix = createPiece(pieces[pieces.length * Math.random() | 0]);
            player.pos.y = 0;
            player.pos.x = (arena[0].length / 2 | 0) -
                (player.matrix[0].length / 2 | 0);
            if (collide(arena, player)) {
                arena.forEach(row => row.fill(0));
                player.score = 0;
                updateScore();
            }
        }

        let dropCounter = 0;
        let dropInterval = 1000;

        let lastTime = 0;

        // Main loop driven by requestAnimationFrame.
        function update(time = 0) {
            const deltaTime = time - lastTime;

            dropCounter += deltaTime;
            if (dropCounter > dropInterval) {
                playerDrop();
            }

            lastTime = time;

            draw();
            requestAnimationFrame(update);
        }

        function updateScore() {
            document.getElementById('score').innerText = player.score;
        }

        const colors = [
            null,
            '#FF0D72',
            '#0DC2FF',
            '#0DFF72',
            '#F538FF',
            '#FF8E0D',
            '#FFE138',
            '#3877FF',
        ];

        const arena = createMatrix(12, 20);

        const player = {
            pos: {x: 0, y: 0},
            matrix: null,
            score: 0,
        };

        // Controls: arrows move/soft-drop, Q/W rotate.
        document.addEventListener('keydown', event => {
            if (event.keyCode === 37) {
                playerMove(-1);
            } else if (event.keyCode === 39) {
                playerMove(1);
            } else if (event.keyCode === 40) {
                playerDrop();
            } else if (event.keyCode === 81) {
                playerRotate(-1);
            } else if (event.keyCode === 87) {
                playerRotate(1);
            }
        });

        playerReset();
        updateScore();
        update();

    </script>
</body>
</html>
|
vttclean.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import datetime
|
| 5 |
+
import glob
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
def clean_text(text):
    """Return *text* with HTML tags stripped and whitespace normalized."""
    without_tags = re.sub(r'<[^>]+>', '', text)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    return collapsed.strip()
|
| 15 |
+
|
| 16 |
+
def is_prefix(a, b):
    """Return True when string *b* begins with string *a*."""
    return b[:len(a)] == a
|
| 18 |
+
|
| 19 |
+
def process_vtt(content):
    """Convert a VTT document into deduplicated "HH:MM:SS.mmm text" lines.

    Auto-generated subtitles tend to repeat a caption while extending it;
    runs where each caption is a prefix of the next are collapsed to the
    final, most complete line.  Returns the surviving lines joined by
    newlines.
    """
    # Drop the WEBVTT header and any metadata up to the first blank line.
    body = re.sub(r'^WEBVTT\n.*?\n\n', '', content, flags=re.DOTALL)

    kept = []   # finished output lines
    run = []    # current chain of captions, each extending the previous

    def emit_run():
        # The last caption in a prefix-chain is the most complete one.
        if run:
            kept.append(run[-1])
            run.clear()

    for block in re.split(r'\n\n+', body):
        lines = block.split('\n')
        if len(lines) < 2:
            continue
        # Keep only the caption's start time (milliseconds retained).
        stamp_match = re.match(r'(\d{2}:\d{2}:\d{2})\.(\d{3})', lines[0])
        if not stamp_match:
            continue
        stamp = f"{stamp_match.group(1)}.{stamp_match.group(2)}"
        # Strip tags and squeeze whitespace (same normalization as clean_text).
        raw = ' '.join(lines[1:])
        text = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', raw)).strip()
        if not text:
            continue

        line = f"{stamp} {text}"
        if run:
            _, prev_text = run[-1].split(' ', 1)
            if not text.startswith(prev_text):
                emit_run()
        run.append(line)

    emit_run()  # flush the trailing chain
    return '\n'.join(kept)
|
| 59 |
+
|
| 60 |
+
if __name__ == "__main__":
    try:
        if len(sys.argv) < 2:
            print("Usage: python vttclean.py <file_pattern>", file=sys.stderr)
            sys.exit(1)

        # Expand the glob ourselves so quoted patterns work on any shell.
        pattern = sys.argv[1]
        for path in glob.glob(pattern):
            with open(path, 'r', encoding='utf-8') as handle:
                cleaned = process_vtt(handle.read())
            print(cleaned)
    except Exception as e:
        print(f"Error processing input: {e}", file=sys.stderr)
        sys.exit(1)
|