Spaces:

LocaleNLP
/

english_hausa

Sleeping

App Files Files Community

Mgolo committed on Aug 12

Commit

488ebab

verified ·

1 Parent(s): 366e052

Upload 3 files

Browse files

Files changed (3) hide show

app.py +178 -0
localenpl5.jpeg +0 -0
requirements.txt +14 -0

app.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import gradio as gr
+from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
+import torch
+import unicodedata
+import re
+import whisper
+import tempfile
+import os
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+import fitz  # PyMuPDF
+import docx
+from bs4 import BeautifulSoup
+import markdown2
+import chardet
+# Device setup: use CUDA when torch reports it as available, otherwise CPU.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Module-level cache slots for the translation pipeline and the Whisper model.
+# Both start as None and are filled in on first use by load_wolof_model()
+# and load_whisper_model() respectively.
+translator = None
+whisper_model = None
+def load_wolof_model():
+    """Return the shared translation pipeline, building it on first use.
+
+    The pipeline is created from the "LocaleNLP/eng_wolof" checkpoint and
+    stored in the module-level ``translator``; later calls return that same
+    object without reloading anything.
+    """
+    global translator
+    if translator is not None:
+        return translator
+    checkpoint = "LocaleNLP/eng_wolof"
+    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
+    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
+    # The pipeline is given index 0 when running on CUDA and -1 otherwise.
+    pipeline_device = 0 if device.type == 'cuda' else -1
+    translator = pipeline("translation", model=seq2seq_model, tokenizer=marian_tokenizer, device=pipeline_device)
+    return translator
+def load_whisper_model():
+    """Return the shared Whisper "base" model, loading it on first use.
+
+    The loaded model is kept in the module-level ``whisper_model`` so that
+    repeated calls reuse it.
+    """
+    global whisper_model
+    if whisper_model is not None:
+        return whisper_model
+    whisper_model = whisper.load_model("base")
+    return whisper_model
+def transcribe_audio(audio_file):
+    """Transcribe an audio clip to text with the cached Whisper model.
+
+    Args:
+        audio_file: Either a filesystem path (str) or a binary file-like
+            object exposing ``read()``.
+
+    Returns:
+        The ``"text"`` entry of the transcription result.
+    """
+    model = load_whisper_model()
+    if isinstance(audio_file, str):
+        return model.transcribe(audio_file)["text"]
+    # The model is handed a path, so file-like input is spooled to a
+    # temporary file first. delete=False keeps it on disk after closing.
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    try:
+        with tmp:
+            tmp.write(audio_file.read())
+        # The handle is closed at this point, so the file can be reopened by path.
+        return model.transcribe(tmp.name)["text"]
+    finally:
+        # Remove the spooled copy even when reading or transcription fails.
+        os.remove(tmp.name)
+def extract_text_from_file(uploaded_file):
+    """Extract plain text from an uploaded document.
+
+    Supported extensions: pdf, docx, html/htm, md, srt, txt/text.
+
+    Args:
+        uploaded_file: A filesystem path (str) or a binary file-like object
+            with a ``name`` attribute and a ``read()`` method.
+
+    Returns:
+        The document's text content as a str.
+
+    Raises:
+        ValueError: If the file extension is not one of the supported types.
+    """
+    import io  # local import: only used to re-wrap bytes that were already read
+    # Handle both filepath (str) and file-like object
+    if isinstance(uploaded_file, str):
+        file_name = uploaded_file
+        with open(uploaded_file, "rb") as f:
+            content = f.read()
+    else:
+        file_name = uploaded_file.name
+        content = uploaded_file.read()
+    file_type = file_name.split('.')[-1].lower()
+    if file_type == "pdf":
+        with fitz.open(stream=content, filetype="pdf") as doc:
+            return "\n".join([page.get_text() for page in doc])
+    if file_type == "docx":
+        # Parse from the bytes already in memory. A file-like upload has been
+        # consumed by read() above, so re-reading it would only work if the
+        # object happened to be seekable.
+        doc = docx.Document(io.BytesIO(content))
+        return "\n".join([para.text for para in doc.paragraphs])
+    # Everything else is treated as text. Fall back to UTF-8 when detection
+    # yields nothing (or an encoding Python does not know) so the branches
+    # below always work on str; leaving bytes here would make the str-pattern
+    # regex fail and the txt branch return bytes.
+    encoding = chardet.detect(content)['encoding'] or "utf-8"
+    try:
+        content = content.decode(encoding, errors='ignore')
+    except LookupError:
+        content = content.decode("utf-8", errors='ignore')
+    if file_type in ("html", "htm"):
+        soup = BeautifulSoup(content, "html.parser")
+        return soup.get_text()
+    if file_type == "md":
+        html = markdown2.markdown(content)
+        soup = BeautifulSoup(html, "html.parser")
+        return soup.get_text()
+    if file_type == "srt":
+        # Drop the cue index and timestamp lines; \r? also covers CRLF files.
+        return re.sub(r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\r?\n", "", content)
+    if file_type in ("txt", "text"):
+        return content
+    raise ValueError("Unsupported file type")
+def translate(text):
+    """Translate text sentence by sentence using the cached pipeline.
+
+    The input is split into paragraphs on newlines and each paragraph into
+    sentences on ". ". Every sentence is prefixed with the ">>wol<<" target
+    tag, translated, and the results are re-joined with ". " and newlines.
+    Blank paragraphs are passed through as empty lines.
+
+    Args:
+        text: The source text (str).
+
+    Returns:
+        The translated text with the input's paragraph layout preserved.
+    """
+    translator = load_wolof_model()
+    lang_tag = ">>wol<<"
+    translated_output = []
+    with torch.no_grad():
+        for para in text.split("\n"):
+            if not para.strip():
+                # Keep blank lines so the output mirrors the input layout.
+                translated_output.append("")
+                continue
+            sentences = [s.strip() for s in para.split('. ') if s.strip()]
+            formatted = [f"{lang_tag} {s}" for s in sentences]
+            results = translator(formatted,
+                                 max_length=5000,
+                                 num_beams=5,
+                                 early_stopping=True,
+                                 no_repeat_ngram_size=3,
+                                 repetition_penalty=1.5,
+                                 length_penalty=1.2)
+            # Upper-case only the first character. str.capitalize() would also
+            # lower-case the remainder of each sentence, mangling proper nouns
+            # and acronyms produced by the model.
+            translated_sentences = []
+            for r in results:
+                sentence = r['translation_text']
+                translated_sentences.append(sentence[:1].upper() + sentence[1:])
+            translated_output.append('. '.join(translated_sentences))
+    return "\n".join(translated_output)
+def process_input(input_mode, text, audio_file, file_obj):
+    """Resolve the selected input mode to the text that should be translated.
+
+    Args:
+        input_mode: One of "Text", "Audio" or "File".
+        text: Text typed by the user; returned as-is in "Text" mode.
+        audio_file: Audio input, transcribed in "Audio" mode when not None.
+        file_obj: Uploaded document, parsed in "File" mode when not None.
+
+    Returns:
+        The resolved text, or "" when the mode is unknown or its input is
+        missing.
+    """
+    if input_mode == "Text":
+        return text
+    if input_mode == "Audio" and audio_file is not None:
+        return transcribe_audio(audio_file)
+    if input_mode == "File" and file_obj is not None:
+        return extract_text_from_file(file_obj)
+    return ""
+def translate_and_return(text):
+    """Translate ``text``, or return a notice when it is empty or whitespace."""
+    has_content = bool(text.strip())
+    return translate(text) if has_content else "No input text to translate."
+# Gradio UI components
+with gr.Blocks() as demo:
+    gr.Markdown("## LocaleNLP English-to-Wolof Translator")
+    gr.Markdown("Upload English text, audio, or document to translate to Wolof using a custom MarianMT model.")
+    with gr.Row():
+        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")
+    # Only the widget matching the selected mode is visible at any time.
+    input_text = gr.Textbox(label="Enter English text", lines=10, visible=True)
+    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
+    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)
+    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
+    translate_button = gr.Button("Translate to Wolof")
+    output_text = gr.Textbox(label="Translated Wolof Text", lines=10, interactive=False)
+    def update_visibility(mode):
+        """Show the input widget for ``mode`` and clear both result boxes."""
+        return {
+            input_text: gr.update(visible=(mode=="Text")),
+            audio_input: gr.update(visible=(mode=="Audio")),
+            file_input: gr.update(visible=(mode=="File")),
+            extracted_text: gr.update(value="", visible=True),
+            output_text: gr.update(value="")
+        }
+    input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])
+    def handle_process(mode, text, audio, file_obj):
+        """Return (extracted_text, error_message); exactly one is non-empty on failure."""
+        try:
+            extracted = process_input(mode, text, audio, file_obj)
+            return extracted, ""
+        except Exception as e:
+            return "", f"Error: {str(e)}"
+    def handle_translate(text):
+        """Translate already-extracted text."""
+        return translate_and_return(text)
+    def handle_process_and_translate(mode, text, audio, file_obj):
+        """Extract the input text and translate it in a single event.
+
+        Registering extraction and translation as two separate click
+        listeners makes both fire from the same click, so the translation
+        step receives the value ``extracted_text`` held before extraction
+        finished and can overwrite an extraction error message. Running both
+        steps here guarantees the translation sees the fresh text.
+        """
+        extracted, error = handle_process(mode, text, audio, file_obj)
+        if error:
+            # Surface the extraction failure instead of attempting to translate.
+            return extracted, error
+        return extracted, handle_translate(extracted or "")
+    translate_button.click(fn=handle_process_and_translate, inputs=[input_mode, input_text, audio_input, file_input], outputs=[extracted_text, output_text])
+demo.launch()

localenpl5.jpeg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+altair
+pandas
+streamlit
+transformers
+torch
+openai-whisper
+nltk
+PyMuPDF
+python-docx
+beautifulsoup4
+markdown2
+chardet
+sentencepiece
+sacremoses