Spaces:

piyazon
/

Uyghur-VibeVoice

Sleeping

App Files Files Community

Piyazon committed 25 days ago

Commit

aae642a

1 Parent(s): df12df5

test

Browse files

Files changed (3) hide show

Dockerfile +34 -0
app.py +7 -0
utils.py +338 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile
FROM python:3.12-slim

# Install system dependencies for torch and audio (git is needed for the clone below)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# --- Settings you can override in the Space "Build arguments" ---
ARG REPO_URL="https://github.com/vibevoice-community/VibeVoice.git"
# Branch or tag to build from; pin a tag for reproducibility.
# NOTE: Dockerfiles have no inline comments -- a '#' after an instruction is
# passed to the instruction as an argument -- so this note lives on its own
# lines. `git clone --branch` accepts branch and tag names, not a commit SHA.
ARG REPO_REF="main"

# Clone the source into /app
RUN git clone --depth 1 --branch "${REPO_REF}" "${REPO_URL}" /app
WORKDIR /app

# Editable install of the cloned project (runs as root, before dropping privileges)
RUN pip install --no-cache-dir -e .

# Run the application as a non-root user with UID 1000
RUN useradd -m -u 1000 user
USER user
# pip installs performed as `user` put their scripts in ~/.local/bin
ENV PATH="/home/user/.local/bin:$PATH"

# Install the Space's own requirements before copying the rest of the files so
# this layer stays cached when only application code changes
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Overlay the Space's files (app.py, utils.py, ...) on top of the cloned source
COPY --chown=user . /app

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from fastapi import FastAPI
# ASGI application object; the Dockerfile's CMD serves it as "app:app".
app = FastAPI()


@app.get("/")
def greet_json():
    """Answer GET / with a fixed JSON greeting."""
    greeting = {"Hello": "World!"}
    return greeting

utils.py ADDED Viewed

	@@ -0,0 +1,338 @@

+"""
+Uyghur Text Processing Utilities
+Contains functions for processing Uyghur text, numbers, and script conversion.
+"""
+import unicodedata
+from pypinyin import pinyin, Style
+import re
+from umsc import UgMultiScriptConverter
+# Initialize the Uyghur script converters (Arabic script <-> Latin script)
+ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
+ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
def number_to_uyghur_arabic_script(number_str):
    """
    Convert a numeric string to its Uyghur pronunciation in Arabic script.

    Supported forms (integer and decimal parts are limited to 9 digits each):
      * integers ('123'); ',' and ' ' are removed first, so '1,234' works too;
      * decimals ('0.001'): integer part, then 'پۈتۈن', then the denominator
        term for the number of significant decimal places, then the decimal
        digits read as a whole number. Trailing zeros are ignored, so '1.0'
        and '1.00' are both read as just the integer part plus 'پۈتۈن';
      * fractions whose numerator and denominator are single digits ('1/4');
      * percentages ('25%'), read as the number followed by 'پىرسەنت';
      * ordinals, marked by a trailing '_' or '-' ('1968_', '5-').

    Input that cannot be pronounced (more than 9 digits, multi-digit fractions)
    is returned unchanged apart from the removal of ',' and ' '.

    NOTE: ordinals of numbers >= 10 whose last two digits are not '00' are
    returned with one trailing space; all other results have no surrounding
    whitespace. This mirrors the historical behaviour -- presumably so that
    text written directly after the marker (e.g. '15-ئاي') stays separated.

    Args:
        number_str (str): Number as a string
            (e.g., '123', '0.001', '1/4', '25%', '1968_', '123456789').
    Returns:
        str: Uyghur pronunciation in Arabic script, or the cleaned input when
            the number is outside the supported range.
    Raises:
        ValueError: If a numeric part of the input cannot be parsed by int().
    """
    # Uyghur number words in Arabic script
    digits = {
        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
    }
    ordinals = {
        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
    }
    tens = {
        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
    }
    units = [
        (1000000000, 'مىليارد'),  # billion
        (1000000, 'مىليون'),      # million
        (1000, 'مىڭ'),             # thousand
        (100, 'يۈز')               # hundred
    ]
    fractions = {
        1: 'ئوندا',         # tenths
        2: 'يۈزدە',         # hundredths
        3: 'مىڭدە',         # thousandths
        4: 'ئون مىڭدە',      # ten-thousandths
        5: 'يۈز مىڭدە',     # hundred-thousandths
        6: 'مىليوندا',     # millionths
        7: 'ئون مىليوندا',  # ten-millionths
        8: 'يۈز مىليوندا',  # hundred-millionths
        9: 'مىليارددا'     # billionths
    }
    ordinal_suffix = 'ىنجى'
    max_digits = 9
    max_value = 999999999

    def integer_to_words(num):
        """Spell out a non-negative integer; negative values yield ''."""
        num = int(num)
        if num == 0:
            return digits[0]
        words = []
        # Large units first (billion, million, thousand, hundred)
        for value, unit_name in units:
            if num >= value:
                count, num = divmod(num, value)
                if count == 1:
                    # e.g., 100 → "يۈز", not "بىر يۈز"
                    words.append(unit_name)
                else:
                    words.append(integer_to_words(count) + ' ' + unit_name)
        # Remaining 0..99
        if num >= 10:
            ten, one = divmod(num, 10)
            words.append(tens[ten * 10])
            if one:
                words.append(digits[one])
        elif num > 0:
            words.append(digits[num])
        return ' '.join(words)

    def with_ordinal_suffix(word):
        """Attach the ordinal suffix to a cardinal word."""
        # A final 'ە' is dropped before the suffix, exactly as the `ordinals`
        # table does for ئالتە → ئالتىنجى (so يىگىرمە → يىگىرمىنجى).
        stem = word[:-1] if word.endswith('ە') else word
        return stem + ordinal_suffix

    # Clean the input (remove commas or spaces)
    cleaned = number_str.replace(',', '').replace(' ', '')

    # Ordinal: trailing '_' or '-'
    if cleaned.endswith(('_', '-')):
        num = int(cleaned[:-1])
        if num > max_value:
            return cleaned
        if num in ordinals:  # single digits have dedicated forms
            return ordinals[num]
        words = integer_to_words(num).split()
        last_digit = num % 10
        if last_digit:
            # Last word is a unit digit: swap in its ordinal form
            words[-1] = ordinals[last_digit]
        else:
            # Last word is a tens/hundred/... word (or zero): add the suffix
            words[-1] = with_ordinal_suffix(words[-1])
        spoken = ' '.join(words)
        # Historical trailing space, see the NOTE in the docstring
        return spoken + ' ' if num % 100 else spoken

    # Percentage: remember it and work on the bare number
    is_percentage = cleaned.endswith('%')
    body = cleaned[:-1] if is_percentage else cleaned

    # Fraction: only single-digit numerator and denominator are pronounced
    if '/' in body:
        numerator, denominator = map(int, body.split('/'))
        if numerator in digits and denominator in digits:
            return f"{digits[denominator]}دە {digits[numerator]}"
        return cleaned

    # Split into integer and decimal parts
    parts = body.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else ''

    # Both parts are limited to 9 digits; otherwise hand the input back
    if len(integer_part) > max_digits or len(decimal_part) > max_digits:
        return cleaned

    pronunciation = integer_to_words(int(integer_part))

    # Decimal part is read as a whole number after its denominator term
    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value; an all-zero decimal part ('0', '00')
        # leaves nothing to pronounce and must not reach int('').
        significant = decimal_part.rstrip('0')
        if significant:
            fraction_term = fractions.get(len(significant), 'مىليارددا')
            pronunciation += ' ' + fraction_term + \
                ' ' + integer_to_words(int(significant))

    if is_percentage:
        pronunciation += ' پىرسەنت'
    return pronunciation.strip()
def process_uyghur_text_with_numbers(text):
    """
    Replace the numbers in a string with their Uyghur pronunciation.

    Every '%' is first replaced by the word ' پىرسەنت ' (surrounded by spaces).
    The text is then scanned for maximal runs of digits and the symbols
    '/', '.', '_' and '-'. A run is converted with
    number_to_uyghur_arabic_script() when it is one of:
      * a fraction    -- digits '/' digits      (e.g. '1/4')
      * an ordinal    -- digits then '_' or '-' (e.g. '1_', '15-')
      * a decimal     -- digits '.' digits      (e.g. '3.14')
      * an integer    -- digits only            (e.g. '123')
    Periods at the end of a run are treated as punctuation rather than as part
    of the number, so a number that closes a sentence ('... 5.') is still
    converted. Any other run, and any run the converter rejects with
    ValueError, is copied through unchanged, as is all non-numeric text.

    A converted ordinal is separated by exactly one space from a letter that
    follows it directly (e.g. '1-ئاي'), whatever its number of digits.

    Args:
        text (str): Input string with Uyghur text and numbers (e.g., '1/4 كىلو 25% تەملىك').
    Returns:
        str: String with numbers converted to Uyghur pronunciation, non-numeric text preserved.
    """
    # After this replacement the text contains no '%', so percentages need no
    # dedicated handling below: '25%' becomes '25' followed by the word.
    text = text.replace('%', ' پىرسەنت ')

    number_chars = frozenset('0123456789/._-')
    ordinal_markers = ('_', '-')

    def is_valid_number(token):
        """Return True when *token* has one of the supported number shapes."""
        if '/' in token and token.count('/') == 1:
            numerator, denominator = token.split('/')
            return numerator.isdigit() and denominator.isdigit()
        if token.endswith(ordinal_markers):
            return token[:-1].isdigit()
        if '.' in token and token.count('.') == 1:
            whole, frac = token.split('.')
            return whole.isdigit() and frac.isdigit()
        return token.isdigit()

    result = []
    length = len(text)
    i = 0
    while i < length:
        # Whitespace and ordinary characters are copied through one by one
        if text[i] not in number_chars:
            result.append(text[i])
            i += 1
            continue

        # Collect the maximal run of digit/symbol characters
        start = i
        while i < length and text[i] in number_chars:
            i += 1
        token = text[start:i]

        # Hand trailing periods back to the scanner: they end the sentence,
        # not the number. Runs consisting only of periods are left alone so
        # that every pass still consumes at least one character.
        core = token.rstrip('.')
        if core and core != token:
            token = core
            i = start + len(core)

        if not is_valid_number(token):
            # Not a recognised number format: treat as regular text
            result.append(token)
            continue

        try:
            converted = number_to_uyghur_arabic_script(token)
        except ValueError:
            # Conversion failed: keep the original characters
            result.append(token)
            continue

        if token.endswith(ordinal_markers):
            # The converter pads only some ordinals with a trailing space;
            # normalise so a directly attached word is always separated by
            # exactly one space.
            converted = converted.rstrip()
            if i < length and text[i].isalpha():
                converted += ' '
        result.append(converted)

    return ''.join(result)
# Characters kept verbatim by fix_pauctuations(): the space plus the Uyghur
# Arabic-script letters. A frozenset gives constant-time membership tests.
_UG_VOCAB = frozenset([
    " ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك",
    "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە",
])
# Look-alike or variant letters mapped onto the vocabulary letter used instead.
_UG_VARIANTS = str.maketrans({
    'ژ': 'ج',
    'ک': 'ك',
    'ی': 'ى',
    'ه': 'ە',
})
# Sentence-ending marks; each becomes two spaces instead of one.
_UG_SENTENCE_END = frozenset({'.', '?', '؟'})


def fix_pauctuations(batch):
    """
    Normalize and clean Uyghur text by fixing punctuation and character variants.

    Steps, in order:
      1. lowercase the text and apply Unicode NFKC normalization;
      2. map the variant letters ژ, ک, ی, ه to ج, ك, ى, ە;
      3. keep characters of the Uyghur vocabulary (letters and the space),
         replace '.', '?' and '؟' with two spaces, and replace every other
         character (digits, Latin letters, other punctuation, newlines, ...)
         with a single space.

    Args:
        batch (str): Input text to be normalized.
    Returns:
        str: Normalized text containing only vocabulary characters and spaces.
    """
    batch = unicodedata.normalize('NFKC', batch.lower())
    # One pass for all variant letters; sources and targets do not overlap,
    # so this equals applying the four replacements one after another.
    batch = batch.translate(_UG_VARIANTS)
    return ''.join(
        char if char in _UG_VOCAB
        else ('  ' if char in _UG_SENTENCE_END else ' ')
        for char in batch
    )
def chinese_to_pinyin(mixed_text):
    """
    Replace each run of Chinese characters in a string with toneless Pinyin.

    A run is one or more consecutive characters in the CJK Unified Ideographs
    range U+4E00..U+9FFF. The syllables of a run are joined with single
    spaces; everything outside such runs is left exactly as it was.

    Args:
        mixed_text (str): Text that may mix Chinese with other languages (e.g., English, Uyghur)
    Returns:
        str: The text with every Chinese run rendered as Pinyin without tone marks
    """
    def _run_to_pinyin(run):
        # pinyin() yields one list of readings per character; take the first
        syllables = pinyin(run.group(0), style=Style.NORMAL)
        return ' '.join(readings[0] for readings in syllables)

    return re.sub(r'[\u4e00-\u9fff]+', _run_to_pinyin, mixed_text)
def preprocess_uyghur_text(text):
    """
    Run the full Uyghur text preprocessing pipeline.

    The stages are applied in this order, each receiving the previous output:
      1. chinese_to_pinyin                -- Chinese characters to Pinyin
      2. ug_latn_to_arab                  -- Latin script to Arabic script
      3. process_uyghur_text_with_numbers -- numbers to spoken words
      4. fix_pauctuations                 -- punctuation fix and normalization

    Args:
        text (str): Input text in any supported format.
    Returns:
        str: Fully preprocessed Uyghur text in Arabic script.
    """
    stages = (
        chinese_to_pinyin,
        ug_latn_to_arab,
        process_uyghur_text_with_numbers,
        fix_pauctuations,
    )
    for stage in stages:
        text = stage(text)
    return text