# Benchmark configuration.
# Directory searched (recursively, via find) for *.txt input files.
input_dir="txt"
# Directory that receives the phn_* output files.
output_dir="sangraha_hi_phonemized"
# Language code. Note: process_file falls back to its own "hi" setting; this
# global is not passed to it by the driver pipeline.
lang=hi
# Upper bound on the number of input files taken from the find listing.
num_files=50000
# Value handed to `parallel -j`; with GNU Parallel a negative value -N means
# "number of CPU threads minus N".
num_jobs=-1
#######################################
# Phonemize one text file with the `phonemize` command-line tool.
# Arguments:
#   $1 - path of the input text file
#   $2 - path of the output file; its parent directories are created
#   $3 - (optional) language code passed to `phonemize -l`; defaults to "hi"
# Outputs:
#   "Processed: <input> -> <output>" on stdout after a successful run;
#   an error message on stderr otherwise.
# Returns:
#   0 on success, 1 if the output directory cannot be created, otherwise
#   the non-zero exit status of phonemize.
#######################################
process_file() {
  local input_file="$1"
  local output_file="$2"
  # Keep the language local so calling this function never clobbers a
  # caller's global `lang`.
  local lang="${3:-hi}"
  local status=0

  # Create the output directory and its parent directories if they don't exist.
  if ! mkdir -p -- "$(dirname -- "$output_file")"; then
    printf 'Failed to create output directory for: %s\n' "$output_file" >&2
    return 1
  fi

  phonemize --quiet -l "$lang" "$input_file" -o "$output_file" \
    --strip --language-switch remove-flags --preserve-punctuation || status=$?
  if (( status != 0 )); then
    # Do not report success for a file that phonemize rejected.
    printf 'Failed (exit %d): %s\n' "$status" "$input_file" >&2
    return "$status"
  fi

  echo "Processed: $input_file -> $output_file"
}
# Export the function so that child bash processes (such as the shells
# spawned by GNU Parallel) can call it.
export -f process_file
# Start the timer
start_time=$(date +%s)

# Collect at most $num_files input paths up front so the report below can
# state how many files were actually submitted, not just the requested limit.
input_files=()
while IFS= read -r input_path; do
  input_files+=("$input_path")
done < <(find "$input_dir" -type f -name "*.txt" | head -n "$num_files")

# Use GNU Parallel to process the files in parallel.
# {/} is GNU Parallel's "basename of the input line" replacement string. A
# shell-side $(basename {}) would be expanded before parallel runs and only
# yield the literal "{}", i.e. the full input path. Note that inputs with the
# same file name in different subdirectories map to the same output path.
parallel_status=0
if (( ${#input_files[@]} > 0 )); then
  printf '%s\n' "${input_files[@]}" \
    | parallel -j "$num_jobs" process_file "{}" "${output_dir}/phn_{/}" \
    || parallel_status=$?
fi

# End the timer
end_time=$(date +%s)

# Calculate the elapsed time
elapsed_time=$((end_time - start_time))

# Convert elapsed time to minutes and seconds
minutes=$((elapsed_time / 60))
seconds=$((elapsed_time % 60))

# Print the benchmark results
echo "Benchmark Results:"
echo "Number of files processed: ${#input_files[@]}"
echo "Number of parallel jobs: $num_jobs"
echo "Elapsed time: $minutes minutes $seconds seconds"

# Surface job failures instead of silently discarding parallel's exit status.
if (( parallel_status != 0 )); then
  printf 'Warning: parallel exited with status %d; some files may not have been processed.\n' \
    "$parallel_status" >&2
fi